mirror of
https://github.com/ansible/awx.git
synced 2026-01-16 04:10:44 -03:30
Merge pull request #2102 from ryanpetrello/fix-2085
WIP: add a background process to spot celery hangs and reload the worker pool
This commit is contained in:
commit
fa9b7106ff
61
awx/main/management/commands/watch_celery.py
Normal file
61
awx/main/management/commands/watch_celery.py
Normal file
@ -0,0 +1,61 @@
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import socket
|
||||
import time
|
||||
|
||||
from celery import Celery
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
"""Watch local celery workers"""
|
||||
help=("Sends a periodic ping to the local celery process over AMQP to ensure "
|
||||
"it's responsive; this command is only intended to run in an environment "
|
||||
"where celeryd is running")
|
||||
|
||||
#
|
||||
# Just because celery is _running_ doesn't mean it's _working_; it's
|
||||
# imperative that celery workers are _actually_ handling AMQP messages on
|
||||
# their appropriate queues for awx to function. Unfortunately, we've been
|
||||
# plagued by a variety of bugs in celery that cause it to hang and become
|
||||
# an unresponsive zombie, such as:
|
||||
#
|
||||
# https://github.com/celery/celery/issues/4185
|
||||
# https://github.com/celery/celery/issues/4457
|
||||
#
|
||||
# The goal of this code is periodically send a broadcast AMQP message to
|
||||
# the celery process on the local host via celery.app.control.ping;
|
||||
# If that _fails_, we attempt to determine the pid of the celery process
|
||||
# and send SIGHUP (which tends to resolve these sorts of issues for us).
|
||||
#
|
||||
|
||||
INTERVAL = 60
|
||||
|
||||
def handle(self, **options):
|
||||
app = Celery('awx')
|
||||
app.config_from_object('django.conf:settings')
|
||||
while True:
|
||||
try:
|
||||
pongs = app.control.ping(['celery@{}'.format(socket.gethostname())])
|
||||
except:
|
||||
pongs = []
|
||||
if len(pongs):
|
||||
sys.stderr.write(str(pongs) + '\n')
|
||||
else:
|
||||
sys.stderr.write('celery is not responsive to ping over local AMQP\n')
|
||||
pid = self.getpid()
|
||||
if pid:
|
||||
sys.stderr.write('sending SIGHUP to {}\n'.format(pid))
|
||||
os.kill(pid, signal.SIGHUP)
|
||||
time.sleep(self.INTERVAL)
|
||||
|
||||
def getpid(self):
|
||||
cmd = 'supervisorctl pid tower-processes:awx-celeryd'
|
||||
if os.path.exists('/supervisor_task.conf'):
|
||||
cmd = 'supervisorctl -c /supervisor_task.conf pid tower-processes:celery'
|
||||
try:
|
||||
return int(subprocess.check_output(cmd, shell=True))
|
||||
except Exception:
|
||||
sys.stderr.write('could not detect celery pid\n')
|
||||
@ -15,6 +15,18 @@ stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
|
||||
[program:celery-watcher]
|
||||
command = /usr/bin/awx-manage celery_watcher
|
||||
directory = /var/lib/awx
|
||||
environment = LANGUAGE="en_US.UTF-8",LANG="en_US.UTF-8",LC_ALL="en_US.UTF-8",LC_CTYPE="en_US.UTF-8"
|
||||
autostart = true
|
||||
autorestart = true
|
||||
stopwaitsecs = 5
|
||||
stdout_logfile=/dev/stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
|
||||
[program:callback-receiver]
|
||||
command = awx-manage run_callback_receiver
|
||||
directory = /var/lib/awx
|
||||
@ -38,7 +50,7 @@ stderr_logfile=/dev/stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
|
||||
[group:tower-processes]
|
||||
programs=celery,callback-receiver,channels-worker
|
||||
programs=celery,celery-watcher,callback-receiver,channels-worker
|
||||
priority=5
|
||||
|
||||
# TODO: Exit Handler
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user