Add TASK_MANAGER_LOCK_TIMEOUT (#15300)

* Add TASK_MANAGER_LOCK_TIMEOUT

`TASK_MANAGER_LOCK_TIMEOUT` controls the `idle_in_transaction_session_timeout` and `idle_session_timeout` configuration for task manager connections and lock in database

hope to prevent the situation that the task instance that holds the lock becomes unresponsive and preventing other instance to be able to run task manager

* Add session timeout to periodic scheduler and all sub task manager locks
This commit is contained in:
Hao Liu 2024-06-27 09:42:41 -04:00 committed by GitHub
parent dbc2215bb6
commit 6f2307f50e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 19 additions and 3 deletions

View File

@ -138,7 +138,8 @@ class TaskBase:
# Lock
with task_manager_bulk_reschedule():
with advisory_lock(f"{self.prefix}_lock", wait=False) as acquired:
lock_session_timeout_milliseconds = settings.TASK_MANAGER_LOCK_TIMEOUT * 1000 # convert to milliseconds
with advisory_lock(f"{self.prefix}_lock", lock_session_timeout_milliseconds=lock_session_timeout_milliseconds, wait=False) as acquired:
with transaction.atomic():
if acquired is False:
logger.debug(f"Not running {self.prefix} scheduler, another task holds lock")

View File

@ -715,7 +715,8 @@ def awx_k8s_reaper():
@task(queue=get_task_queuename)
def awx_periodic_scheduler():
with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired:
lock_session_timeout_milliseconds = settings.TASK_MANAGER_LOCK_TIMEOUT * 1000
with advisory_lock('awx_periodic_scheduler_lock', lock_session_timeout_milliseconds=lock_session_timeout_milliseconds, wait=False) as acquired:
if acquired is False:
logger.debug("Not running periodic scheduler, another task holds lock")
return

View File

@ -8,9 +8,22 @@ from django.db import connection
@contextmanager
def advisory_lock(*args, **kwargs):
def advisory_lock(*args, lock_session_timeout_milliseconds=0, **kwargs):
if connection.vendor == 'postgresql':
cur = None
idle_in_transaction_session_timeout = None
idle_session_timeout = None
if lock_session_timeout_milliseconds > 0:
with connection.cursor() as cur:
idle_in_transaction_session_timeout = cur.execute('SHOW idle_in_transaction_session_timeout').fetchone()[0]
idle_session_timeout = cur.execute('SHOW idle_session_timeout').fetchone()[0]
cur.execute(f"SET idle_in_transaction_session_timeout = {lock_session_timeout_milliseconds}")
cur.execute(f"SET idle_session_timeout = {lock_session_timeout_milliseconds}")
with django_pglocks_advisory_lock(*args, **kwargs) as internal_lock:
yield internal_lock
if lock_session_timeout_milliseconds > 0:
with connection.cursor() as cur:
cur.execute(f"SET idle_in_transaction_session_timeout = {idle_in_transaction_session_timeout}")
cur.execute(f"SET idle_session_timeout = {idle_session_timeout}")
else:
yield True

View File

@ -262,6 +262,7 @@ START_TASK_LIMIT = 100
# We have the grace period so the task manager can bail out before the timeout.
TASK_MANAGER_TIMEOUT = 300
TASK_MANAGER_TIMEOUT_GRACE_PERIOD = 60
TASK_MANAGER_LOCK_TIMEOUT = TASK_MANAGER_TIMEOUT + TASK_MANAGER_TIMEOUT_GRACE_PERIOD
# Number of seconds _in addition to_ the task manager timeout a job can stay
# in waiting without being reaped