diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d6a657365..eebbf8f026 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ This is a list of high-level changes for each release of AWX. A full list of com - Updated the AWX CLI to export labels associated with Workflow Job Templates - https://github.com/ansible/awx/pull/7847 - Updated to the latest python-ldap to address a bug - https://github.com/ansible/awx/issues/7868 - Upgraded git-python to fix a bug that caused workflows to sometimes fail - https://github.com/ansible/awx/issues/6119 +- Worked around a bug in the channels_redis library that slowly causes Daphne processes to leak memory over time - https://github.com/django/channels_redis/issues/212 - Fixed a bug in the AWX CLI that prevented Workflow nodes from importing properly - https://github.com/ansible/awx/issues/7793 - Fixed a bug in the awx.awx collection release process that templated the wrong version - https://github.com/ansible/awx/issues/7870 - Fixed a bug that caused errors rendering stdout that contained UTF-16 surrogate pairs - https://github.com/ansible/awx/pull/7918 diff --git a/awx/main/consumers.py b/awx/main/consumers.py index b6d8872ebd..d32219b3ac 100644 --- a/awx/main/consumers.py +++ b/awx/main/consumers.py @@ -1,3 +1,5 @@ +import collections +import functools import json import logging import time @@ -12,12 +14,40 @@ from django.contrib.auth.models import User from channels.generic.websocket import AsyncJsonWebsocketConsumer from channels.layers import get_channel_layer from channels.db import database_sync_to_async +from channels_redis.core import RedisChannelLayer logger = logging.getLogger('awx.main.consumers') XRF_KEY = '_auth_user_xrf' +class BoundedQueue(asyncio.Queue): + + def put_nowait(self, item): + if self.full(): + # the queue is at capacity: discard the oldest item to make room + # reaching this branch most likely means the consumer for this + # channel has stopped reading from its queue + # unfortunately, channels_redis keeps queueing messages for such a
+ # channel indefinitely, with no upper bound of its own; see + # https://github.com/django/channels_redis/issues/212 + # this is a minor concern for browser clients that disconnect, + # but it *does* cause a problem for our global broadcast topic + # that's used to broadcast messages to peers in a cluster + # in that situation it is better to drop the oldest message + # than to let memory usage grow without bound + self.get_nowait() + return super(BoundedQueue, self).put_nowait(item) + + +class ExpiringRedisChannelLayer(RedisChannelLayer): + def __init__(self, *args, **kw): + super(ExpiringRedisChannelLayer, self).__init__(*args, **kw) + self.receive_buffer = collections.defaultdict( + functools.partial(BoundedQueue, self.capacity) + ) + + class WebsocketSecretAuthHelper: """ Middlewareish for websockets to verify node websocket broadcast interconnect. diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 355d247f62..716aea3aa7 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -916,7 +916,7 @@ ASGI_APPLICATION = "awx.main.routing.application" CHANNEL_LAYERS = { "default": { - "BACKEND": "channels_redis.core.RedisChannelLayer", + "BACKEND": "awx.main.consumers.ExpiringRedisChannelLayer", "CONFIG": { "hosts": [BROKER_URL], "capacity": 10000,