From 30f556f8458563eb0bde23b958d815c9f068679f Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Fri, 15 Jul 2022 14:09:39 -0400 Subject: [PATCH] Further resiliency changes focused on offline database Make logs from database outage more manageable Raise exception if update_model never recovers from problem --- awx/conf/settings.py | 4 +- awx/main/analytics/subsystem_metrics.py | 7 ++- awx/main/dispatch/worker/callback.py | 25 +++++++--- .../management/commands/run_wsbroadcast.py | 9 +++- .../functional/utils/test_update_model.py | 47 +++++++++++++++++++ awx/main/utils/update_model.py | 6 ++- 6 files changed, 85 insertions(+), 13 deletions(-) create mode 100644 awx/main/tests/functional/utils/test_update_model.py diff --git a/awx/conf/settings.py b/awx/conf/settings.py index 9853d1bb01..70e40fadcc 100644 --- a/awx/conf/settings.py +++ b/awx/conf/settings.py @@ -80,7 +80,7 @@ def _ctit_db_wrapper(trans_safe=False): yield except DBError as exc: if trans_safe: - level = logger.exception + level = logger.warning if isinstance(exc, ProgrammingError): if 'relation' in str(exc) and 'does not exist' in str(exc): # this generally means we can't fetch Tower configuration @@ -89,7 +89,7 @@ def _ctit_db_wrapper(trans_safe=False): # has come up *before* the database has finished migrating, and # especially that the conf.settings table doesn't exist yet level = logger.debug - level('Database settings are not available, using defaults.') + level(f'Database settings are not available, using defaults. error: {str(exc)}') else: logger.exception('Error modifying something related to database settings.') finally: diff --git a/awx/main/analytics/subsystem_metrics.py b/awx/main/analytics/subsystem_metrics.py index d52e986c63..52dfcde27b 100644 --- a/awx/main/analytics/subsystem_metrics.py +++ b/awx/main/analytics/subsystem_metrics.py @@ -312,7 +312,12 @@ class Metrics: self.previous_send_metrics.set(current_time) self.previous_send_metrics.store_value(self.conn) finally: - lock.release() + try: + lock.release() + except Exception as exc: + # After system failures, we might throw redis.exceptions.LockNotOwnedError + # this is to avoid print a Traceback, and importantly, avoid raising an exception into parent context + logger.warning(f'Error releasing subsystem metrics redis lock, error: {str(exc)}') def load_other_metrics(self, request): # data received from other nodes are stored in their own keys diff --git a/awx/main/dispatch/worker/callback.py b/awx/main/dispatch/worker/callback.py index b54c3e7399..6b419a00d8 100644 --- a/awx/main/dispatch/worker/callback.py +++ b/awx/main/dispatch/worker/callback.py @@ -167,17 +167,27 @@ class CallbackBrokerWorker(BaseWorker): try: cls.objects.bulk_create(events) metrics_bulk_events_saved += len(events) - except Exception: + except Exception as exc: + logger.warning(f'Error in events bulk_create, will try indiviually up to 5 errors, error {str(exc)}') # if an exception occurs, we should re-attempt to save the # events one-by-one, because something in the list is # broken/stale + consecutive_errors = 0 + events_saved = 0 metrics_events_batch_save_errors += 1 for e in events: try: e.save() - metrics_singular_events_saved += 1 - except Exception: - logger.exception('Database Error Saving Job Event') + events_saved += 0 + consecutive_errors = 0 + except Exception as exc_indv: + consecutive_errors += 1 + if consecutive_errors >= 5: + raise + logger.info(f'Database Error Saving individual Job Event, error {str(exc_indv)}') + metrics_singular_events_saved += events_saved + if events_saved == 0: + raise metrics_duration_to_save = time.perf_counter() - metrics_duration_to_save for e in events: if not getattr(e, '_skip_websocket_message', False): @@ -257,17 +267,18 @@ class CallbackBrokerWorker(BaseWorker): try: self.flush(force=flush) break - except (OperationalError, InterfaceError, InternalError): + except (OperationalError, InterfaceError, InternalError) as exc: if retries >= self.MAX_RETRIES: logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.') return delay = 60 * retries - logger.exception('Database Error Saving Job Event, retry #{i} in {delay} seconds:'.format(i=retries + 1, delay=delay)) + logger.warning(f'Database Error Flushing Job Events, retry #{retries + 1} in {delay} seconds: {str(exc)}') django_connection.close() time.sleep(delay) retries += 1 except DatabaseError: - logger.exception('Database Error Saving Job Event') + logger.exception('Database Error Flushing Job Events') + django_connection.close() break except Exception as exc: tb = traceback.format_exc() diff --git a/awx/main/management/commands/run_wsbroadcast.py b/awx/main/management/commands/run_wsbroadcast.py index dee2f842c5..f997415923 100644 --- a/awx/main/management/commands/run_wsbroadcast.py +++ b/awx/main/management/commands/run_wsbroadcast.py @@ -95,8 +95,13 @@ class Command(BaseCommand): # database migrations are still running from awx.main.models.ha import Instance - executor = MigrationExecutor(connection) - migrating = bool(executor.migration_plan(executor.loader.graph.leaf_nodes())) + try: + executor = MigrationExecutor(connection) + migrating = bool(executor.migration_plan(executor.loader.graph.leaf_nodes())) + except Exception as exc: + logger.info(f'Error on startup of run_wsbroadcast (error: {exc}), retry in 10s...') + time.sleep(10) + return # In containerized deployments, migrations happen in the task container, # and the services running there don't start until migrations are diff --git a/awx/main/tests/functional/utils/test_update_model.py b/awx/main/tests/functional/utils/test_update_model.py new file mode 100644 index 0000000000..94304cca30 --- /dev/null +++ b/awx/main/tests/functional/utils/test_update_model.py @@ -0,0 +1,47 @@ +import pytest + +from django.db import DatabaseError + +from awx.main.models.jobs import Job +from awx.main.utils.update_model import update_model + + +@pytest.fixture +def normal_job(deploy_jobtemplate): + return deploy_jobtemplate.create_unified_job() + + +class NewException(Exception): + pass + + +@pytest.mark.django_db +def test_normal_get(normal_job): + mod_job = Job.objects.get(pk=normal_job.id) + mod_job.job_explanation = 'foobar' + mod_job.save(update_fields=['job_explanation']) + new_job = update_model(Job, normal_job.pk) + assert new_job.job_explanation == 'foobar' + + +@pytest.mark.django_db +def test_exception(normal_job, mocker): + mocker.patch.object(Job.objects, 'get', side_effect=DatabaseError) + mocker.patch('awx.main.utils.update_model.time.sleep') + with pytest.raises(DatabaseError): + update_model(Job, normal_job.pk) + + +@pytest.mark.django_db +def test_unknown_exception(normal_job, mocker): + mocker.patch.object(Job.objects, 'get', side_effect=NewException) + mocker.patch('awx.main.utils.update_model.time.sleep') + with pytest.raises(NewException): + update_model(Job, normal_job.pk) + + +@pytest.mark.django_db +def test_deleted_job(normal_job): + job_pk = normal_job.pk + normal_job.delete() + assert update_model(Job, job_pk) is None diff --git a/awx/main/utils/update_model.py b/awx/main/utils/update_model.py index 80d930e2c5..0b2998561c 100644 --- a/awx/main/utils/update_model.py +++ b/awx/main/utils/update_model.py @@ -1,4 +1,5 @@ from django.db import transaction, DatabaseError, InterfaceError +from django.core.exceptions import ObjectDoesNotExist import logging import time @@ -32,6 +33,8 @@ def update_model(model, pk, _attempt=0, _max_attempts=5, select_for_update=False update_fields.append('failed') instance.save(update_fields=update_fields) return instance + except ObjectDoesNotExist: + return None except (DatabaseError, InterfaceError) as e: # Log out the error to the debug logger. logger.debug('Database error updating %s, retrying in 5 seconds (retry #%d): %s', model._meta.object_name, _attempt + 1, e) @@ -45,4 +48,5 @@ def update_model(model, pk, _attempt=0, _max_attempts=5, select_for_update=False raise RuntimeError(f'Could not fetch {pk} because of receiving abort signal') return update_model(model, pk, _attempt=_attempt + 1, _max_attempts=_max_attempts, **updates) else: - logger.error('Failed to update %s after %d retries.', model._meta.object_name, _attempt) + logger.warning(f'Failed to update {model._meta.object_name} pk={pk} after {_attempt} retries.') + raise