Merge pull request #12527 from AlanCoding/offline_db

Further resiliency changes, specifically focused on the case of the database going offline
Alan Rominger (committed by GitHub), 2022-08-17 10:10:50 -04:00
6 changed files with 86 additions and 13 deletions

View File

@@ -80,7 +80,7 @@ def _ctit_db_wrapper(trans_safe=False):
         yield
     except DBError as exc:
         if trans_safe:
-            level = logger.exception
+            level = logger.warning
             if isinstance(exc, ProgrammingError):
                 if 'relation' in str(exc) and 'does not exist' in str(exc):
                     # this generally means we can't fetch Tower configuration
@@ -89,7 +89,7 @@ def _ctit_db_wrapper(trans_safe=False):
                     # has come up *before* the database has finished migrating, and
                     # especially that the conf.settings table doesn't exist yet
                     level = logger.debug
-            level('Database settings are not available, using defaults.')
+            level(f'Database settings are not available, using defaults. error: {str(exc)}')
         else:
             logger.exception('Error modifying something related to database settings.')
     finally:
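The effect of this change is easiest to see out of context. Below is a minimal sketch of the same pattern, not the AWX code itself (the wrapper name is illustrative): pick a log level based on how expected the failure is, so a database that is merely still migrating logs at debug instead of dumping an ERROR traceback.

import logging
from contextlib import contextmanager

from django.db import Error as DBError, ProgrammingError

logger = logging.getLogger(__name__)


@contextmanager
def settings_db_wrapper(trans_safe=False):
    # Hypothetical reduction of _ctit_db_wrapper: choose a log level for
    # expected failures instead of always emitting an ERROR traceback.
    try:
        yield
    except DBError as exc:
        if not trans_safe:
            logger.exception('Error modifying something related to database settings.')
            return
        level = logger.warning
        if isinstance(exc, ProgrammingError) and 'does not exist' in str(exc):
            # the settings table has not been migrated yet: routine at startup
            level = logger.debug
        level(f'Database settings are not available, using defaults. error: {exc}')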

View File

@@ -313,7 +313,12 @@ class Metrics:
             self.previous_send_metrics.set(current_time)
             self.previous_send_metrics.store_value(self.conn)
         finally:
-            lock.release()
+            try:
+                lock.release()
+            except Exception as exc:
+                # After system failures, we might throw redis.exceptions.LockNotOwnedError;
+                # this avoids printing a traceback and, importantly, avoids raising an exception into the parent context
+                logger.warning(f'Error releasing subsystem metrics redis lock, error: {str(exc)}')

     def load_other_metrics(self, request):
         # data received from other nodes are stored in their own keys
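Abstracted away from the Metrics class, the release logic above amounts to the following sketch, assuming a redis-py connection; the lock name, timeout, and standalone function are invented for illustration.

import logging

import redis

logger = logging.getLogger(__name__)


def send_metrics_guarded(conn, send):
    # Hypothetical standalone version of the pattern above. A lock held
    # across a system failure can expire, so release() may raise
    # redis.exceptions.LockNotOwnedError; log it instead of crashing.
    lock = conn.lock('subsystem_metrics_lock', timeout=60)  # name/timeout invented
    if not lock.acquire(blocking=False):
        return  # another node holds the lock and is already sending
    try:
        send()
    finally:
        try:
            lock.release()
        except redis.exceptions.LockError as exc:
            logger.warning(f'Error releasing subsystem metrics redis lock, error: {exc}')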

View File

@@ -167,17 +167,28 @@ class CallbackBrokerWorker(BaseWorker):
                 try:
                     cls.objects.bulk_create(events)
                     metrics_bulk_events_saved += len(events)
-                except Exception:
+                except Exception as exc:
+                    logger.warning(f'Error in events bulk_create, will try individually up to 5 errors, error {str(exc)}')
                     # if an exception occurs, we should re-attempt to save the
                     # events one-by-one, because something in the list is
                     # broken/stale
+                    consecutive_errors = 0
+                    events_saved = 0
                     metrics_events_batch_save_errors += 1
                     for e in events:
                         try:
                             e.save()
-                            metrics_singular_events_saved += 1
-                        except Exception:
-                            logger.exception('Database Error Saving Job Event')
+                            events_saved += 1
+                            consecutive_errors = 0
+                        except Exception as exc_indv:
+                            consecutive_errors += 1
+                            logger.info(f'Database Error Saving individual Job Event, error {str(exc_indv)}')
+                            if consecutive_errors >= 5:
+                                raise
+                    metrics_singular_events_saved += events_saved
+                    if events_saved == 0:
+                        raise
                 metrics_duration_to_save = time.perf_counter() - metrics_duration_to_save
                 for e in events:
                     if not getattr(e, '_skip_websocket_message', False):
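Distilled from flush(), the fallback amounts to this sketch (the standalone function and its parameters are hypothetical): bulk-insert first for speed, fall back to row-by-row saves so one broken event cannot sink the whole batch, but re-raise once the errors look systemic.

import logging

logger = logging.getLogger(__name__)


def save_events(cls, events, max_consecutive_errors=5):
    # Hypothetical distillation of the flush() change above.
    try:
        cls.objects.bulk_create(events)
        return len(events)
    except Exception as exc:
        logger.warning(f'Error in events bulk_create, will try individually, error {exc}')
        consecutive_errors = 0
        events_saved = 0
        for e in events:
            try:
                e.save()
                events_saved += 1
                consecutive_errors = 0
            except Exception as exc_indv:
                consecutive_errors += 1
                logger.info(f'Database error saving individual event, error {exc_indv}')
                if consecutive_errors >= max_consecutive_errors:
                    raise  # repeated failures suggest the database itself is down
        if events_saved == 0:
            raise  # nothing saved at all: re-raise the original bulk_create error
        return events_saved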
@@ -257,17 +268,18 @@ class CallbackBrokerWorker(BaseWorker):
             try:
                 self.flush(force=flush)
                 break
-            except (OperationalError, InterfaceError, InternalError):
+            except (OperationalError, InterfaceError, InternalError) as exc:
                 if retries >= self.MAX_RETRIES:
                     logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.')
                     return
                 delay = 60 * retries
-                logger.exception('Database Error Saving Job Event, retry #{i} in {delay} seconds:'.format(i=retries + 1, delay=delay))
+                logger.warning(f'Database Error Flushing Job Events, retry #{retries + 1} in {delay} seconds: {str(exc)}')
                 django_connection.close()
                 time.sleep(delay)
                 retries += 1
             except DatabaseError:
-                logger.exception('Database Error Saving Job Event')
+                logger.exception('Database Error Flushing Job Events')
                 django_connection.close()
                 break
             except Exception as exc:
                 tb = traceback.format_exc()
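The retry loop follows a common escalating-backoff shape. A self-contained sketch of it, with MAX_RETRIES assumed here rather than read from the worker class:

import logging
import time

from django.db import DatabaseError, InterfaceError, InternalError, OperationalError
from django.db import connection as django_connection

logger = logging.getLogger(__name__)

MAX_RETRIES = 3  # assumed value; the worker class defines its own


def flush_with_retries(flush):
    # Hypothetical reduction of the retry loop: connectivity errors get an
    # escalating delay (0s, 60s, 120s, ...) plus a closed connection so
    # Django reconnects; other database errors give up immediately.
    retries = 0
    while True:
        try:
            flush()
            return
        except (OperationalError, InterfaceError, InternalError) as exc:
            if retries >= MAX_RETRIES:
                logger.exception('Could not re-establish database connectivity, giving up on one or more events.')
                return
            delay = 60 * retries
            logger.warning(f'Database error flushing events, retry #{retries + 1} in {delay} seconds: {exc}')
            django_connection.close()
            time.sleep(delay)
            retries += 1
        except DatabaseError:
            logger.exception('Database error flushing events')
            django_connection.close()
            return

Closing the connection matters here: after a database restart the old socket is dead, and django_connection.close() forces Django to open a fresh connection on the next query.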

View File

@@ -95,8 +95,13 @@ class Command(BaseCommand):
         # database migrations are still running
         from awx.main.models.ha import Instance

-        executor = MigrationExecutor(connection)
-        migrating = bool(executor.migration_plan(executor.loader.graph.leaf_nodes()))
+        try:
+            executor = MigrationExecutor(connection)
+            migrating = bool(executor.migration_plan(executor.loader.graph.leaf_nodes()))
+        except Exception as exc:
+            logger.info(f'Error on startup of run_wsbroadcast (error: {exc}), retry in 10s...')
+            time.sleep(10)
+            return
         # In containerized deployments, migrations happen in the task container,
         # and the services running there don't start until migrations are
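MigrationExecutor and migration_plan() are Django migration internals that query the database, so when the database is down the check itself throws. A standalone sketch of the guarded lookup (the wrapper function is invented for illustration); returning instead of raising lets the service manager restart the process cleanly rather than leaving a startup traceback.

import logging
import time

from django.db import connection
from django.db.migrations.executor import MigrationExecutor

logger = logging.getLogger(__name__)


def pending_migrations_or_retry():
    # Hypothetical standalone version of the startup check: if the database
    # is unreachable, log quietly, wait, and return None so the caller can
    # bail out and let the service manager restart the process.
    try:
        executor = MigrationExecutor(connection)
        return bool(executor.migration_plan(executor.loader.graph.leaf_nodes()))
    except Exception as exc:
        logger.info(f'Error checking migration state (error: {exc}), retry in 10s...')
        time.sleep(10)
        return None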

View File

@@ -0,0 +1,47 @@
+import pytest
+
+from django.db import DatabaseError
+
+from awx.main.models.jobs import Job
+from awx.main.utils.update_model import update_model
+
+
+@pytest.fixture
+def normal_job(deploy_jobtemplate):
+    return deploy_jobtemplate.create_unified_job()
+
+
+class NewException(Exception):
+    pass
+
+
+@pytest.mark.django_db
+def test_normal_get(normal_job):
+    mod_job = Job.objects.get(pk=normal_job.id)
+    mod_job.job_explanation = 'foobar'
+    mod_job.save(update_fields=['job_explanation'])
+    new_job = update_model(Job, normal_job.pk)
+    assert new_job.job_explanation == 'foobar'
+
+
+@pytest.mark.django_db
+def test_exception(normal_job, mocker):
+    mocker.patch.object(Job.objects, 'get', side_effect=DatabaseError)
+    mocker.patch('awx.main.utils.update_model.time.sleep')
+    with pytest.raises(DatabaseError):
+        update_model(Job, normal_job.pk)
+
+
+@pytest.mark.django_db
+def test_unknown_exception(normal_job, mocker):
+    mocker.patch.object(Job.objects, 'get', side_effect=NewException)
+    mocker.patch('awx.main.utils.update_model.time.sleep')
+    with pytest.raises(NewException):
+        update_model(Job, normal_job.pk)
+
+
+@pytest.mark.django_db
+def test_deleted_job(normal_job):
+    job_pk = normal_job.pk
+    normal_job.delete()
+    assert update_model(Job, job_pk) is None

View File

@@ -1,4 +1,5 @@
 from django.db import transaction, DatabaseError, InterfaceError
+from django.core.exceptions import ObjectDoesNotExist

 import logging
 import time
@@ -32,6 +33,8 @@ def update_model(model, pk, _attempt=0, _max_attempts=5, select_for_update=False
                         update_fields.append('failed')
                 instance.save(update_fields=update_fields)
             return instance
+    except ObjectDoesNotExist:
+        return None
     except (DatabaseError, InterfaceError) as e:
         # Log out the error to the debug logger.
         logger.debug('Database error updating %s, retrying in 5 seconds (retry #%d): %s', model._meta.object_name, _attempt + 1, e)
@@ -45,4 +48,5 @@ def update_model(model, pk, _attempt=0, _max_attempts=5, select_for_update=False
             raise RuntimeError(f'Could not fetch {pk} because of receiving abort signal')
         return update_model(model, pk, _attempt=_attempt + 1, _max_attempts=_max_attempts, **updates)
     else:
-        logger.error('Failed to update %s after %d retries.', model._meta.object_name, _attempt)
+        logger.warning(f'Failed to update {model._meta.object_name} pk={pk} after {_attempt} retries.')
         raise
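A hypothetical caller, to show why the new ObjectDoesNotExist branch matters: None now distinguishes "the row was deleted" from "the database is unreachable", which still raises once the retries are exhausted. The helper below is invented for illustration; the import paths match those used in the tests above.

import logging

from awx.main.models.jobs import Job
from awx.main.utils.update_model import update_model

logger = logging.getLogger(__name__)


def mark_job_failed(job_pk, explanation):
    # With the ObjectDoesNotExist handling above, None means the row was
    # deleted mid-update (no longer an error to report), while a dead
    # database still raises after update_model's retries are exhausted.
    job = update_model(Job, job_pk, status='failed', job_explanation=explanation)
    if job is None:
        logger.info(f'Job {job_pk} no longer exists, skipping status update')
    return job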