Allow wsrelay to fail without FATAL (#15191)

We have not identify the root cause of wsrelay failure but attempt to make wsrelay restart itself resulted in postgres and redis connection leak. We were not able to fully identify where the redis connection leak comes from so reverting back to failing and removing startsecs 30 will prevent wsrelay to FATAL
This commit is contained in:
Hao Liu
2024-05-20 19:34:12 -04:00
committed by GitHub
parent 7de350dc3e
commit fc9064e27f
3 changed files with 58 additions and 85 deletions

View File

@@ -101,8 +101,9 @@ class Command(BaseCommand):
migrating = bool(executor.migration_plan(executor.loader.graph.leaf_nodes()))
connection.close() # Because of async nature, main loop will use new connection, so close this
except Exception as exc:
logger.warning(f'Error on startup of run_wsrelay (error: {exc}), retry in 10s...')
time.sleep(10)
time.sleep(10) # Prevent supervisor from restarting the service too quickly and the service to enter FATAL state
# sleeping before logging because logging rely on setting which require database connection...
logger.warning(f'Error on startup of run_wsrelay (error: {exc}), slept for 10s...')
return
# In containerized deployments, migrations happen in the task container,
@@ -121,13 +122,14 @@ class Command(BaseCommand):
return
try:
my_hostname = Instance.objects.my_hostname()
my_hostname = Instance.objects.my_hostname() # This relies on settings.CLUSTER_HOST_ID which requires database connection
logger.info('Active instance with hostname {} is registered.'.format(my_hostname))
except RuntimeError as e:
# the CLUSTER_HOST_ID in the task, and web instance must match and
# ensure network connectivity between the task and web instance
logger.info('Unable to return currently active instance: {}, retry in 5s...'.format(e))
time.sleep(5)
time.sleep(10) # Prevent supervisor from restarting the service too quickly and the service to enter FATAL state
# sleeping before logging because logging rely on setting which require database connection...
logger.warning(f"Unable to return currently active instance: {e}, slept for 10s before return.")
return
if options.get('status'):
@@ -166,12 +168,14 @@ class Command(BaseCommand):
WebsocketsMetricsServer().start()
while True:
try:
asyncio.run(WebSocketRelayManager().run())
except KeyboardInterrupt:
logger.info('Shutting down Websocket Relayer')
break
except Exception as e:
logger.exception('Error in Websocket Relayer, exception: {}. Restarting in 10 seconds'.format(e))
time.sleep(10)
try:
logger.info('Starting Websocket Relayer...')
websocket_relay_manager = WebSocketRelayManager()
asyncio.run(websocket_relay_manager.run())
except KeyboardInterrupt:
logger.info('Terminating Websocket Relayer')
except BaseException as e: # BaseException is used to catch all exceptions including asyncio.CancelledError
time.sleep(10) # Prevent supervisor from restarting the service too quickly and the service to enter FATAL state
# sleeping before logging because logging rely on setting which require database connection...
logger.warning(f"Encounter error while running Websocket Relayer {e}, slept for 10s...")
return