AC-1015 Added retry count for saving to database, minor retry refactor for publishing job events.

This commit is contained in:
Chris Church 2014-02-09 13:51:40 -05:00
parent a3ee83258c
commit cb8b58b0ae
2 changed files with 23 additions and 15 deletions

View File

@@ -94,7 +94,7 @@ class BaseTask(Task):
# Commit outstanding transaction so that we fetch the latest object
# from the database.
transaction.commit()
while True:
for retry_count in xrange(5):
try:
instance = self.model.objects.get(pk=pk)
if updates:
@@ -109,12 +109,16 @@ class BaseTask(Task):
update_fields.append('failed')
instance.save(update_fields=update_fields)
transaction.commit()
break
return instance
except DatabaseError as e:
transaction.rollback()
logger.debug("Database error encountered, retrying in 5 seconds: " + str(e))
logger.debug('Database error updating %s, retrying in 5 '
'seconds (retry #%d): %s',
self.model._meta.object_name, retry_count + 1, e)
time.sleep(5)
return instance
else:
logger.error('Failed to update %s after %d retries.',
self.model._meta.object_name, retry_count)
def get_model(self, pk):
    """Return this task's model instance for the given primary key.

    Unlike the retrying save path above, this is a single direct lookup
    with no retry/rollback handling; ``DoesNotExist`` propagates to the
    caller.
    """
    return self.model.objects.get(pk=pk)
@@ -556,7 +560,7 @@ class SaveJobEvents(Task):
data['task'] = data.get('event_data', {}).get('task', '').strip()
duplicate = False
if event != '__complete__':
while True:
for retry_count in xrange(11):
try:
# Commit any outstanding events before saving stats.
if event == 'playbook_on_stats':
@@ -573,8 +577,12 @@ class SaveJobEvents(Task):
break
except DatabaseError as e:
transaction.rollback()
logger.debug("Database error encountered, retrying in 1 second: " + str(e))
logger.debug('Database error saving job event, retrying in '
'1 second (retry #%d): %s', retry_count + 1, e)
time.sleep(1)
else:
logger.error('Failed to save job event after %d retries.',
retry_count)
if not duplicate:
if event not in events_received:
events_received[event] = 1

View File

@@ -161,8 +161,7 @@ class CallbackModule(object):
msg.update({
'pid': os.getpid(),
})
retry_count = 0
while True:
for retry_count in xrange(4):
try:
if not hasattr(self, 'connection_pid'):
self.connection_pid = os.getpid()
@@ -170,14 +169,16 @@ class CallbackModule(object):
self._cleanup_connection()
if not hasattr(self, 'connection'):
self.connection = Connection(self.broker_url, transport_options={'confirm_publish': True})
self.logger.debug('New Connection: %r, retry=%d', self.connection, retry_count)
self.logger.debug('New Connection: %r, retry=%d',
self.connection, retry_count)
if not hasattr(self, 'producer'):
channel = self.connection.channel()
self.producer = self.connection.Producer(channel, exchange=self.job_events_exchange, serializer='json')
self.publish = self.connection.ensure(self.producer, self.producer.publish,
errback=self._publish_errback,
max_retries=3, interval_start=1, interval_step=1, interval_max=10)
self.logger.debug('New Producer: %r, retry=%d', self.producer, retry_count)
self.logger.debug('New Producer: %r, retry=%d',
self.producer, retry_count)
self.logger.debug('Publish: %r, retry=%d', msg, retry_count)
self.publish(msg, exchange=self.job_events_exchange,
routing_key=('job_events[%d]' % self.job_id),
@@ -186,12 +187,11 @@ class CallbackModule(object):
self._cleanup_connection()
return
except Exception, e:
self.logger.info('Publish Exception: %r, retry=%d', e, retry_count, exc_info=True)
if retry_count < 3:
self._cleanup_connection()
else:
self.logger.info('Publish Exception: %r, retry=%d', e,
retry_count, exc_info=True)
self._cleanup_connection()
if retry_count >= 3:
raise
retry_count += 1
def _post_rest_api_event(self, event, event_data):
data = json.dumps({