From 6f2307f50e0cb21d310b0899b45580434f796df9 Mon Sep 17 00:00:00 2001 From: Hao Liu <44379968+TheRealHaoLiu@users.noreply.github.com> Date: Thu, 27 Jun 2024 09:42:41 -0400 Subject: [PATCH] Add TASK_MANAGER_LOCK_TIMEOUT (#15300) * Add TASK_MANAGER_LOCK_TIMEOUT `TASK_MANAGER_LOCK_TIMEOUT` controls the `idle_in_transaction_session_timeout` and `idle_session_timeout` configuration for task manager connections and lock in database hope to prevent the situation that the task instance that holds the lock becomes unresponsive and preventing other instance to be able to run task manager * Add session timeout to periodic scheduler and all sub task manager locks --- awx/main/scheduler/task_manager.py | 3 ++- awx/main/tasks/system.py | 3 ++- awx/main/utils/pglock.py | 15 ++++++++++++++- awx/settings/defaults.py | 1 + 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 7e72d4cb8e..272779b3e5 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -138,7 +138,8 @@ class TaskBase: # Lock with task_manager_bulk_reschedule(): - with advisory_lock(f"{self.prefix}_lock", wait=False) as acquired: + lock_session_timeout_milliseconds = settings.TASK_MANAGER_LOCK_TIMEOUT * 1000 # convert to milliseconds + with advisory_lock(f"{self.prefix}_lock", lock_session_timeout_milliseconds=lock_session_timeout_milliseconds, wait=False) as acquired: with transaction.atomic(): if acquired is False: logger.debug(f"Not running {self.prefix} scheduler, another task holds lock") diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py index aa65e706c1..c0f71bbc7f 100644 --- a/awx/main/tasks/system.py +++ b/awx/main/tasks/system.py @@ -715,7 +715,8 @@ def awx_k8s_reaper(): @task(queue=get_task_queuename) def awx_periodic_scheduler(): - with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired: + lock_session_timeout_milliseconds = settings.TASK_MANAGER_LOCK_TIMEOUT * 1000 + with advisory_lock('awx_periodic_scheduler_lock', lock_session_timeout_milliseconds=lock_session_timeout_milliseconds, wait=False) as acquired: if acquired is False: logger.debug("Not running periodic scheduler, another task holds lock") return diff --git a/awx/main/utils/pglock.py b/awx/main/utils/pglock.py index 3d8a00d20a..a47880b667 100644 --- a/awx/main/utils/pglock.py +++ b/awx/main/utils/pglock.py @@ -8,9 +8,22 @@ from django.db import connection @contextmanager -def advisory_lock(*args, **kwargs): +def advisory_lock(*args, lock_session_timeout_milliseconds=0, **kwargs): if connection.vendor == 'postgresql': + cur = None + idle_in_transaction_session_timeout = None + idle_session_timeout = None + if lock_session_timeout_milliseconds > 0: + with connection.cursor() as cur: + idle_in_transaction_session_timeout = cur.execute('SHOW idle_in_transaction_session_timeout').fetchone()[0] + idle_session_timeout = cur.execute('SHOW idle_session_timeout').fetchone()[0] + cur.execute(f"SET idle_in_transaction_session_timeout = {lock_session_timeout_milliseconds}") + cur.execute(f"SET idle_session_timeout = {lock_session_timeout_milliseconds}") with django_pglocks_advisory_lock(*args, **kwargs) as internal_lock: yield internal_lock + if lock_session_timeout_milliseconds > 0: + with connection.cursor() as cur: + cur.execute(f"SET idle_in_transaction_session_timeout = {idle_in_transaction_session_timeout}") + cur.execute(f"SET idle_session_timeout = {idle_session_timeout}") else: yield True diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 93c6e6f60d..546f68f50d 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -262,6 +262,7 @@ START_TASK_LIMIT = 100 # We have the grace period so the task manager can bail out before the timeout. TASK_MANAGER_TIMEOUT = 300 TASK_MANAGER_TIMEOUT_GRACE_PERIOD = 60 +TASK_MANAGER_LOCK_TIMEOUT = TASK_MANAGER_TIMEOUT + TASK_MANAGER_TIMEOUT_GRACE_PERIOD # Number of seconds _in addition to_ the task manager timeout a job can stay # in waiting without being reaped