AAP-57817 Add Redis connection retry using redis-py 7.0+ built-in (#16176)

* AAP-57817 Add Redis connection retry using redis-py 7.0+ built-in mechanism

* Refactor Redis client helpers to use settings and eliminate code duplication

* Create awx/main/utils/redis.py and move Redis client functions to avoid circular imports

* Fix subsystem_metrics to share Redis connection pool between
  client and pipeline

* Cache Redis clients in RelayConsumer and RelayWebsocketStatsManager to avoid creating new connection pools on every call

* Add backoff cap and base config

* Add Redis retry logic with exponential backoff to handle connection failures during long-running operations

* Add REDIS_BACKOFF_CAP and REDIS_BACKOFF_BASE settings to allow
  adjustment of retry timing in worst-case scenarios without code changes

* Simplify Redis retry tests by removing unnecessary reload logic
This commit is contained in:
Lila Yasin
2025-12-01 09:08:47 -05:00
committed by GitHub
parent 0d86874d5d
commit 4f41b50a09
17 changed files with 264 additions and 24 deletions

View File

@@ -77,15 +77,34 @@ def swagger_autogen(requests=__SWAGGER_REQUESTS__):
class FakeRedis:
    """No-op stand-in for a Redis client.

    Every method accepts and ignores arbitrary positional and keyword
    arguments and returns a fixed "empty" value, so code under test can call
    it the way it would call a real client without a server being present.
    """

    def __init__(self, *args, **kwargs):
        # Accept and ignore all arguments to match redis.Redis signature
        pass

    def keys(self, *args, **kwargs):
        """Return an empty list: the fake never holds any keys."""
        return []

    def set(self, *args, **kwargs):
        """Discard the write and return None."""
        pass

    def get(self, *args, **kwargs):
        """Return None, as if the requested key did not exist."""
        return None

    def rpush(self, *args, **kwargs):
        """Discard the pushed values and always return 1."""
        return 1

    def blpop(self, *args, **kwargs):
        """Return None immediately instead of blocking."""
        return None

    def delete(self, *args, **kwargs):
        """Do nothing and return None."""
        pass

    def llen(self, *args, **kwargs):
        """Return 0: every list is reported as empty."""
        return 0

    def scan_iter(self, *args, **kwargs):
        """Return an iterator that yields nothing."""
        return iter([])

    @classmethod
    def from_url(cls, *args, **kwargs):
        """Build a fake client; the URL and any options are ignored."""
        return cls()

View File

@@ -1,6 +1,7 @@
import datetime
from unittest.mock import Mock, patch
from awx.main.analytics.broadcast_websocket import FixedSlidingWindow
from awx.main.analytics.broadcast_websocket import FixedSlidingWindow, RelayWebsocketStatsManager
from awx.main.analytics.broadcast_websocket import dt_to_seconds
@@ -59,3 +60,70 @@ class TestFixedSlidingWindow:
assert 20 - i == fsw.render(self.ts(minute=1, second=i, microsecond=0)), "E. Sliding window where 1 record() should drop from the results each time"
assert 0 == fsw.render(self.ts(minute=1, second=20, microsecond=0)), "F. First second one minute after all record() calls"
class TestRelayWebsocketStatsManager:
    """Test Redis client caching in RelayWebsocketStatsManager."""

    def setup_method(self):
        # The client is cached on the class, so clear it before every test to
        # make sure each test goes through the patched get_redis_client.
        RelayWebsocketStatsManager._redis_client = None

    def teardown_method(self):
        # Runs even when a test fails, so a cached Mock never outlives the
        # test that created it.
        RelayWebsocketStatsManager._redis_client = None

    def test_get_stats_sync_caches_redis_client(self):
        """Verify get_stats_sync caches Redis client to avoid creating new connection pools."""
        mock_redis = Mock()
        mock_redis.get.return_value = b''

        with patch('awx.main.analytics.broadcast_websocket.get_redis_client', return_value=mock_redis) as mock_get_client:
            # First call should create client
            RelayWebsocketStatsManager.get_stats_sync()
            assert mock_get_client.call_count == 1

            # Second call should reuse cached client
            RelayWebsocketStatsManager.get_stats_sync()
            assert mock_get_client.call_count == 1  # Still 1, not called again

            # Third call should still reuse cached client
            RelayWebsocketStatsManager.get_stats_sync()
            assert mock_get_client.call_count == 1

    def test_get_stats_sync_returns_parsed_metrics(self):
        """Verify get_stats_sync returns parsed metric families from Redis."""
        # Sample Prometheus metrics format
        sample_metrics = b'# HELP test_metric A test metric\n# TYPE test_metric gauge\ntest_metric 42\n'

        mock_redis = Mock()
        mock_redis.get.return_value = sample_metrics

        with patch('awx.main.analytics.broadcast_websocket.get_redis_client', return_value=mock_redis):
            result = list(RelayWebsocketStatsManager.get_stats_sync())

            # Should return parsed metric families
            assert len(result) > 0
            assert mock_redis.get.called

    def test_get_stats_sync_handles_empty_redis_data(self):
        """Verify get_stats_sync handles empty data from Redis gracefully."""
        mock_redis = Mock()
        mock_redis.get.return_value = None  # Redis returns None when key doesn't exist

        with patch('awx.main.analytics.broadcast_websocket.get_redis_client', return_value=mock_redis):
            result = list(RelayWebsocketStatsManager.get_stats_sync())

            # Should handle empty data gracefully
            assert result == []
            assert mock_redis.get.called

View File

@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 Ansible, Inc.
# All Rights Reserved
from django.test.utils import override_settings
from awx.main.utils.redis import get_redis_client, get_redis_client_async
from redis.exceptions import BusyLoadingError, ConnectionError, TimeoutError
from redis.backoff import ExponentialBackoff
class TestRedisRetryConfiguration:
    """Check that the Redis retry settings end up on the clients' connection pools."""

    def test_retry_configuration_applied_to_client(self, settings):
        """Sync and async clients both carry the configured retry/backoff values."""
        # --- synchronous client ---
        sync_client = get_redis_client()
        sync_retry = sync_client.connection_pool.connection_kwargs['retry']
        sync_backoff = sync_retry._backoff
        sync_errors = sync_client.connection_pool.connection_kwargs['retry_on_error']

        # Values on the objects must equal the settings, which in turn must
        # equal the expected defaults.
        assert sync_retry._retries == settings.REDIS_RETRY_COUNT == 3
        assert isinstance(sync_backoff, ExponentialBackoff)
        assert sync_backoff._base == settings.REDIS_BACKOFF_BASE == 0.5
        assert sync_backoff._cap == settings.REDIS_BACKOFF_CAP == 1.0
        for expected_error in (BusyLoadingError, ConnectionError, TimeoutError):
            assert expected_error in sync_errors

        # --- asynchronous client: same configuration expected ---
        async_client = get_redis_client_async()
        async_retry = async_client.connection_pool.connection_kwargs['retry']
        async_backoff = async_retry._backoff
        async_errors = async_client.connection_pool.connection_kwargs['retry_on_error']

        assert async_retry._retries == settings.REDIS_RETRY_COUNT
        assert async_backoff._base == settings.REDIS_BACKOFF_BASE
        assert async_backoff._cap == settings.REDIS_BACKOFF_CAP
        assert ConnectionError in async_errors

    @override_settings(REDIS_RETRY_COUNT=5)
    def test_override_settings_applied_to_client(self):
        """A retry count changed via override_settings shows up on a new client."""
        pool_kwargs = get_redis_client().connection_pool.connection_kwargs
        assert pool_kwargs['retry']._retries == 5

    @override_settings(REDIS_BACKOFF_CAP=2.0, REDIS_BACKOFF_BASE=1.0)
    def test_override_backoff_settings_applied_to_client(self):
        """Backoff cap and base changed via override_settings show up on a new client."""
        pool_kwargs = get_redis_client().connection_pool.connection_kwargs
        configured_backoff = pool_kwargs['retry']._backoff

        # The overridden values, not the defaults, must be on the backoff object
        assert configured_backoff._cap == 2.0
        assert configured_backoff._base == 1.0