Patches to make jobs robust to database restarts (#11905)

* Simple patches to make jobs robust to database restarts

* Add some wait time before retrying the loop after a DB error

* Apply dispatcher downtime setting to job updates, fix dispatcher bug

This resolves a bug where the pg_is_down property
  never had the right value: the loop is normally stuck
  in the conn.events() iterator, so it never recognized
  successful database interactions. This led to serial
  database outages terminating jobs.
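
  For illustration only, a minimal sketch of the failure mode (the
  function names here are hypothetical; pg_is_down and conn.events()
  are from the patch). Any statement placed after the events loop is
  effectively unreachable while the listener is healthy, because the
  iterator blocks between notifications:

    # Hypothetical sketch of the bug, not the patched AWX code itself
    def broken_listen_loop(self, conn):
        for event in conn.events():  # blocks here between notifications
            self.process_task(event)
        self.pg_is_down = False  # never reached while the connection stays healthy

    def fixed_listen_loop(self, conn):
        for event in conn.events():
            self.process_task(event)
            self.pg_is_down = False  # reset on every successful interaction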

New setting for allowable PG downtime is shared with task code:
  any calls to update_model will use the _max_attempts parameter
  so that its patience aligns with the downtime the dispatcher
  tolerates when consuming new events.

* To avoid restart loops, handle DB errors on startup with prejudice

* If reconnect consistently fails, exit with non-zero code
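
  That exception deliberately escapes run(); a minimal sketch of how a
  service entrypoint (assumed here, not part of this diff) turns the
  unhandled error into a non-zero exit status for the process supervisor:

    import logging
    import sys

    logger = logging.getLogger(__name__)

    def main(consumer):
        try:
            consumer.run()  # raises if the DB never recovers within tolerance
        except Exception:
            logger.exception("Dispatcher exiting after unrecoverable database error")
            sys.exit(1)  # non-zero so the supervisor records the failure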

commit 73e02e745a
parent ef0f6ca248
Author: Alan Rominger
Date: 2022-03-30 09:14:20 -04:00
Committed by: GitHub

5 changed files with 37 additions and 8 deletions


@@ -134,6 +134,13 @@ class AWXConsumerRedis(AWXConsumerBase):
class AWXConsumerPG(AWXConsumerBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pg_max_wait = settings.DISPATCHER_DB_DOWNTOWN_TOLLERANCE
        # if no successful loops have run since startup, then we should fail right away
        self.pg_is_down = True  # set so that we fail if we get database errors on startup
        self.pg_down_time = time.time() - self.pg_max_wait  # allow no grace period

    def run(self, *args, **kwargs):
        super(AWXConsumerPG, self).run(*args, **kwargs)
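
Backdating pg_down_time by pg_max_wait is what removes the startup grace
period: the very first database error already appears to have exceeded
the tolerance window. A quick check of the arithmetic (the 40-second
value is hypothetical):

    import time

    pg_max_wait = 40  # hypothetical tolerance, in seconds
    pg_down_time = time.time() - pg_max_wait  # backdated at startup

    # Any later check is already at or past the limit, so a database
    # error during startup raises immediately instead of being tolerated
    assert time.time() - pg_down_time >= pg_max_wait
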
@@ -150,11 +157,28 @@ class AWXConsumerPG(AWXConsumerBase):
                        init = True
                    for e in conn.events():
                        self.process_task(json.loads(e.payload))
                        self.pg_is_down = False
                        if self.should_stop:
                            return
            except psycopg2.InterfaceError:
                logger.warning("Stale Postgres message bus connection, reconnecting")
                continue
            except (db.DatabaseError, psycopg2.OperationalError):
                # If we have attained steady state operation, tolerate short-term database hiccups
                if not self.pg_is_down:
                    logger.exception(f"Error consuming new events from postgres, will retry for {self.pg_max_wait} s")
                    self.pg_down_time = time.time()
                    self.pg_is_down = True
                if time.time() - self.pg_down_time > self.pg_max_wait:
                    logger.warning(f"Postgres event consumer has not recovered in {self.pg_max_wait} s, exiting")
                    raise
                # Wait for a second before next attempt, but still listen for any shutdown signals
                for i in range(10):
                    if self.should_stop:
                        return
                    time.sleep(0.1)
                # Close any Django connections that went bad during the outage
                for conn in db.connections.all():
                    conn.close_if_unusable_or_obsolete()


class BaseWorker(object):
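
The downtime bookkeeping above can be exercised without Postgres; a
standalone sketch under assumed names (DowntimeTracker is not part of
the patch) that mirrors the same state machine:

    import time

    class DowntimeTracker:
        """Hypothetical distillation of the pg_is_down bookkeeping."""

        def __init__(self, max_wait):
            self.max_wait = max_wait
            self.is_down = True  # fail fast on startup errors
            self.down_time = time.time() - max_wait  # no grace period yet

        def on_success(self):
            self.is_down = False

        def on_error(self):
            """Return True if the tolerance window is exhausted."""
            if not self.is_down:
                self.down_time = time.time()
                self.is_down = True
            return time.time() - self.down_time > self.max_wait

    tracker = DowntimeTracker(max_wait=0.2)
    assert tracker.on_error()  # startup error: already over the window
    tracker.on_success()  # steady state reached
    assert not tracker.on_error()  # first error inside the window is tolerated
    time.sleep(0.3)
    assert tracker.on_error()  # window exhausted, the caller should re-raise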