Merge pull request #2102 from ryanpetrello/fix-2085

WIP: add a background process to spot celery hangs and reload the worker pool
This commit is contained in:
Ryan Petrello 2018-06-11 12:44:18 -04:00 committed by GitHub
commit fa9b7106ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 74 additions and 1 deletions

View File

@ -0,0 +1,61 @@
import os
import signal
import subprocess
import sys
import socket
import time
from celery import Celery
from django.core.management.base import BaseCommand
class Command(BaseCommand):
"""Watch local celery workers"""
help=("Sends a periodic ping to the local celery process over AMQP to ensure "
"it's responsive; this command is only intended to run in an environment "
"where celeryd is running")
#
# Just because celery is _running_ doesn't mean it's _working_; it's
# imperative that celery workers are _actually_ handling AMQP messages on
# their appropriate queues for awx to function. Unfortunately, we've been
# plagued by a variety of bugs in celery that cause it to hang and become
# an unresponsive zombie, such as:
#
# https://github.com/celery/celery/issues/4185
# https://github.com/celery/celery/issues/4457
#
# The goal of this code is periodically send a broadcast AMQP message to
# the celery process on the local host via celery.app.control.ping;
# If that _fails_, we attempt to determine the pid of the celery process
# and send SIGHUP (which tends to resolve these sorts of issues for us).
#
INTERVAL = 60
def handle(self, **options):
app = Celery('awx')
app.config_from_object('django.conf:settings')
while True:
try:
pongs = app.control.ping(['celery@{}'.format(socket.gethostname())])
except:
pongs = []
if len(pongs):
sys.stderr.write(str(pongs) + '\n')
else:
sys.stderr.write('celery is not responsive to ping over local AMQP\n')
pid = self.getpid()
if pid:
sys.stderr.write('sending SIGHUP to {}\n'.format(pid))
os.kill(pid, signal.SIGHUP)
time.sleep(self.INTERVAL)
def getpid(self):
cmd = 'supervisorctl pid tower-processes:awx-celeryd'
if os.path.exists('/supervisor_task.conf'):
cmd = 'supervisorctl -c /supervisor_task.conf pid tower-processes:celery'
try:
return int(subprocess.check_output(cmd, shell=True))
except Exception:
sys.stderr.write('could not detect celery pid\n')

View File

@ -15,6 +15,18 @@ stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:celery-watcher]
command = /usr/bin/awx-manage celery_watcher
directory = /var/lib/awx
environment = LANGUAGE="en_US.UTF-8",LANG="en_US.UTF-8",LC_ALL="en_US.UTF-8",LC_CTYPE="en_US.UTF-8"
autostart = true
autorestart = true
stopwaitsecs = 5
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:callback-receiver]
command = awx-manage run_callback_receiver
directory = /var/lib/awx
@ -38,7 +50,7 @@ stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[group:tower-processes]
programs=celery,callback-receiver,channels-worker
programs=celery,celery-watcher,callback-receiver,channels-worker
priority=5
# TODO: Exit Handler