Major fixes for job callback receiver processing

* Add logic to the Ansible callback plugin to prevent it from waiting forever to submit events to Tower
* Lower the process recycle threshold for the Tower callback receiver
* Make the recycle threshold configurable
* Properly exit the main callback receiver management process if the event receiver process is dead, so we don't leave dead worker processes behind
* Set a configurable maximum number of messages that can be waiting in a worker process queue before that worker is skipped, instead of filling up memory on a dead worker process
* Skip over a dead worker process if its queue is full
* Force a restart of the callback receiver if all queues are dead
* Roll back transaction.atomic on the suspicion that it is causing deadlocks in the worker process; use the old commit_on_success mechanism with retry logic instead
* Separate the expected non-blocking queue exception from any other type of exception that could be encountered during the queue fetch operation
2026-07-20 12:52:01 -02:30 · 2015-03-13 11:11:49 -04:00
parent 6258035ca8
commit 0f5beca9ae
3 changed files with 65 additions and 47 deletions
--- a/awx/plugins/callback/job_event_callback.py
+++ b/awx/plugins/callback/job_event_callback.py
@@ -121,6 +121,8 @@ class CallbackModule(object):
    def _start_connection(self):
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
+        self.socket.setsockopt(zmq.RCVTIMEO, 4000)
+        self.socket.setsockopt(zmq.LINGER, 2000)
        self.socket.connect(self.callback_consumer_port)

    def _post_job_event_queue_msg(self, event, event_data):
@@ -146,16 +148,15 @@ class CallbackModule(object):
                    self._init_connection()
                if self.context is None:
                    self._start_connection()
-
                self.socket.send_json(msg)
                self.socket.recv()
                return
            except Exception, e:
-                self.logger.info('Publish Exception: %r, retry=%d', e,
+                self.logger.info('Publish Job Event Exception: %r, retry=%d', e,
                                 retry_count, exc_info=True)
-                # TODO: Maybe recycle connection here?
+                retry_count += 1
                if retry_count >= 3:
-                    raise
+                    break

    def _post_rest_api_event(self, event, event_data):
        data = json.dumps({