AAP-70257 controller collection should retry transient HTTP errors with exponential backoff. (#16415)

controller collection should retry transient HTTP errors with exponential backoff
This commit is contained in:
Sean Sullivan
2026-04-21 10:12:08 -04:00
committed by GitHub
parent e5bae59f5a
commit d21e0141ce
3 changed files with 243 additions and 106 deletions

View File

@@ -55,6 +55,20 @@ options:
- Defaults to 10s, but this is handled by the shared module_utils code - Defaults to 10s, but this is handled by the shared module_utils code
type: float type: float
aliases: [ aap_request_timeout ] aliases: [ aap_request_timeout ]
max_retries:
description:
- Maximum number of times a request is retried after a transient failure, such as a connection error or a retryable HTTP 5xx response.
- Defaults to 5.
- If value not set, will try environment variable C(AAP_MAX_RETRIES) and then config files.
type: int
aliases: [ aap_max_retries ]
retry_backoff_factor:
description:
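- Base of the exponential backoff between retries; the delay before retry N is about C(retry_backoff_factor ** (N - 1)) seconds, with random jitter applied.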
- Defaults to 2.
- If value not set, will try environment variable C(AAP_RETRY_BACKOFF_FACTOR) and then config files.
type: int
aliases: [ aap_retry_backoff_factor ]
controller_config_file: controller_config_file:
description: description:
- Path to the controller config file. - Path to the controller config file.

View File

@@ -76,6 +76,24 @@ options:
why: Support for AAP variables why: Support for AAP variables
alternatives: 'AAP_REQUEST_TIMEOUT' alternatives: 'AAP_REQUEST_TIMEOUT'
aliases: [ aap_request_timeout ] aliases: [ aap_request_timeout ]
max_retries:
description:
- Maximum number of times a request is retried after a transient failure, such as a connection error or a retryable HTTP 5xx response.
- Defaults to 5.
- This will not work with the export or import modules.
type: int
env:
- name: AAP_MAX_RETRIES
aliases: [ aap_max_retries ]
retry_backoff_factor:
description:
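- Base of the exponential backoff between retries; the delay before retry N is about C(retry_backoff_factor ** (N - 1)) seconds, with random jitter applied.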
- Defaults to 2.
- This will not work with the export or import modules.
type: int
env:
- name: AAP_RETRY_BACKOFF_FACTOR
aliases: [ aap_retry_backoff_factor ]
notes: notes:
- If no I(config_file) is provided we will attempt to use the tower-cli library - If no I(config_file) is provided we will attempt to use the tower-cli library
defaults to find your host information. defaults to find your host information.

View File

@@ -15,6 +15,7 @@ from ansible.module_utils.six.moves.configparser import ConfigParser, NoOptionEr
from base64 import b64encode from base64 import b64encode
from socket import getaddrinfo, IPPROTO_TCP from socket import getaddrinfo, IPPROTO_TCP
import time import time
import random
from json import loads, dumps from json import loads, dumps
from os.path import isfile, expanduser, split, join, exists, isdir from os.path import isfile, expanduser, split, join, exists, isdir
from os import access, R_OK, getcwd, environ, getenv from os import access, R_OK, getcwd, environ, getenv
@@ -37,6 +38,19 @@ except ImportError:
CONTROLLER_BASE_PATH_ENV_VAR = "CONTROLLER_OPTIONAL_API_URLPATTERN_PREFIX" CONTROLLER_BASE_PATH_ENV_VAR = "CONTROLLER_OPTIONAL_API_URLPATTERN_PREFIX"
# Retry policy tables: HTTP status code -> list of HTTP methods that may be
# retried when that status is returned.

# 502 (Bad Gateway) and 503 (Service Unavailable): GET, POST, PATCH and DELETE
# are all listed, i.e. these statuses are retried regardless of method.
ALWAYS_RETRYABLE = {
    status: ['GET', 'POST', 'PATCH', 'DELETE']
    for status in (502, 503)
}

# 500 (Internal Server Error) and 504 (Gateway Timeout): only GET, PATCH and
# DELETE are listed. POST is deliberately absent from this table, so a POST
# that fails with one of these statuses is not retried on the strength of
# this table alone.
IDEMPOTENT_RETRYABLE = {
    status: ['GET', 'PATCH', 'DELETE']
    for status in (500, 504)
}
class ConfigFileException(Exception): class ConfigFileException(Exception):
pass pass
@@ -72,6 +86,16 @@ class ControllerModule(AnsibleModule):
aliases=['aap_request_timeout'], aliases=['aap_request_timeout'],
required=False, required=False,
fallback=(env_fallback, ['CONTROLLER_REQUEST_TIMEOUT', 'AAP_REQUEST_TIMEOUT'])), fallback=(env_fallback, ['CONTROLLER_REQUEST_TIMEOUT', 'AAP_REQUEST_TIMEOUT'])),
max_retries=dict(
type='int',
aliases=['aap_max_retries'],
required=False,
fallback=(env_fallback, ['AAP_MAX_RETRIES'])),
retry_backoff_factor=dict(
type='int',
aliases=['aap_retry_backoff_factor'],
required=False,
fallback=(env_fallback, ['AAP_RETRY_BACKOFF_FACTOR'])),
aap_token=dict( aap_token=dict(
type='raw', type='raw',
no_log=True, no_log=True,
@@ -92,12 +116,16 @@ class ControllerModule(AnsibleModule):
'password': 'controller_password', 'password': 'controller_password',
'verify_ssl': 'validate_certs', 'verify_ssl': 'validate_certs',
'request_timeout': 'request_timeout', 'request_timeout': 'request_timeout',
'max_retries': 'max_retries',
'retry_backoff_factor': 'retry_backoff_factor',
} }
host = '127.0.0.1' host = '127.0.0.1'
username = None username = None
password = None password = None
verify_ssl = True verify_ssl = True
request_timeout = 10 request_timeout = 10
max_retries = 5
retry_backoff_factor = 2
authenticated = False authenticated = False
config_name = 'tower_cli.cfg' config_name = 'tower_cli.cfg'
version_checked = False version_checked = False
@@ -488,6 +516,49 @@ class ControllerAPIModule(ControllerModule):
def resolve_name_to_id(self, endpoint, name_or_id): def resolve_name_to_id(self, endpoint, name_or_id):
return self.get_exactly_one(endpoint, name_or_id)['id'] return self.get_exactly_one(endpoint, name_or_id)['id']
def is_retryable(self, status_code, method, endpoint):
    """
    Determine whether a failed request is safe to retry.

    The decision is made in three steps:

    1. Any (status, method) pair listed in ALWAYS_RETRYABLE is retried.
    2. Any (status, method) pair listed in IDEMPOTENT_RETRYABLE is retried.
    3. A POST that failed with 500 or 504 is retried unless the endpoint
       looks like one that starts execution (launch, relaunch, callback,
       or the ad_hoc_commands collection root).

    Everything else is reported as not retryable.

    Args:
        status_code (int): HTTP status code returned by the server.
        method (str): HTTP verb ('GET', 'POST', etc.); compared case-insensitively.
        endpoint (str): The API endpoint path (e.g. '/api/v2/job_templates/1/launch/').

    Returns:
        bool: True if the request can safely be retried.
    """
    # Normalise inputs so a lowercase verb or a missing endpoint cannot
    # silently disable retries or raise on the substring checks below.
    method = (method or '').upper()
    endpoint = endpoint or ''

    # Step 1: statuses that are retried for every listed method.
    if method in ALWAYS_RETRYABLE.get(status_code, []):
        return True

    # Step 2: statuses that are retried only for the listed methods.
    if method in IDEMPOTENT_RETRYABLE.get(status_code, []):
        return True

    # Step 3 only concerns POST on 500/504; anything else that reached this
    # point matched neither table and is not retried.
    if method != 'POST' or status_code not in (500, 504):
        return False

    # Endpoints that trigger execution: retrying would run the job twice.
    # Matches e.g. /job_templates/1/launch/, /workflow_job_templates/1/launch/,
    # /jobs/1/relaunch/, /ad_hoc_commands/1/relaunch/.
    launch_keywords = ('/launch', '/relaunch', '/callback')
    if any(keyword in endpoint for keyword in launch_keywords):
        return False

    # POST to the ad_hoc_commands collection root creates AND immediately
    # executes the command, so it is not safe to retry either.
    if endpoint.rstrip('/').endswith('/ad_hoc_commands'):
        return False

    # Remaining POST endpoints (plain resource creation) are retried.
    # NOTE(review): this assumes a 500/504 means the create did not commit;
    # if that does not hold, a retry could create a duplicate -- confirm.
    return True
def make_request(self, method, endpoint, *args, **kwargs): def make_request(self, method, endpoint, *args, **kwargs):
# In case someone is calling us directly; make sure we were given a method, let's not just assume a GET # In case someone is calling us directly; make sure we were given a method, let's not just assume a GET
if not method: if not method:
@@ -512,121 +583,155 @@ class ControllerAPIModule(ControllerModule):
headers.setdefault('Content-Type', 'application/json') headers.setdefault('Content-Type', 'application/json')
kwargs['headers'] = headers kwargs['headers'] = headers
data = None # Important, if content type is not JSON, this should not be dict type data = None
if headers.get('Content-Type', '') == 'application/json': if headers.get('Content-Type', '') == 'application/json':
data = dumps(kwargs.get('data', {})) data = dumps(kwargs.get('data', {}))
try: # ----------------------------------------------------------------
response = self.session.open( # Retry loop — wraps only the session.open() + HTTPError handling
method, url.geturl(), # Everything above (auth, URL building) happens once before the loop
headers=headers, # ----------------------------------------------------------------
timeout=self.request_timeout, max_retries = self.max_retries
validate_certs=self.verify_ssl, backoff_factor = self.retry_backoff_factor
follow_redirects=True, last_response = None
data=data
)
except (SSLValidationError) as ssl_err:
self.fail_json(msg="Could not establish a secure connection to your host ({1}): {0}.".format(url.netloc, ssl_err))
except (ConnectionError) as con_err:
self.fail_json(msg="There was a network error of some kind trying to connect to your host ({1}): {0}.".format(url.netloc, con_err))
except (HTTPError) as he:
# Sanity check: Did the server send back some kind of internal error?
if he.code >= 500:
self.fail_json(msg='The host sent back a server error ({1}): {0}. Please check the logs and try again later'.format(url.path, he))
# Sanity check: Did we fail to authenticate properly? If so, fail out now; this is always a failure.
elif he.code == 401:
self.fail_json(msg='Invalid authentication credentials for {0} (HTTP 401).'.format(url.path))
# Sanity check: Did we get a forbidden response, which means that the user isn't allowed to do this? Report that.
elif he.code == 403:
# Hack: Tell the customer to use the platform supported collection when interacting with Org, Team, User Controller endpoints
err_msg = he.fp.read().decode('utf-8')
try:
# Defensive coding. Handle json responses and non-json responses
err_msg = loads(err_msg)
err_msg = err_msg['detail']
# JSONDecodeError only available on Python 3.5+
except ValueError:
pass
prepend_msg = " Use the collection ansible.platform to modify resources Organization, User, or Team." if (
"this resource via the platform ingress") in err_msg else ""
self.fail_json(msg="You don't have permission to {1} to {0} (HTTP 403).{2}".format(url.path, method, prepend_msg))
# Sanity check: Did we get a 404 response?
# Requests with primary keys will return a 404 if there is no response, and we want to consistently trap these.
elif he.code == 404:
if kwargs.get('return_none_on_404', False):
return None
self.fail_json(msg='The requested object could not be found at {0}.'.format(url.path))
# Sanity check: Did we get a 405 response?
# A 405 means we used a method that isn't allowed. Usually this is a bad request, but it requires special treatment because the
# API sends it as a logic error in a few situations (e.g. trying to cancel a job that isn't running).
elif he.code == 405:
self.fail_json(msg="Cannot make a request with the {0} method to this endpoint {1}".format(method, url.path))
# Sanity check: Did we get some other kind of error? If so, write an appropriate error message.
elif he.code >= 400:
# We are going to return a 400 so the module can decide what to do with it
page_data = he.read()
try:
return {'status_code': he.code, 'json': loads(page_data)}
# JSONDecodeError only available on Python 3.5+
except ValueError:
return {'status_code': he.code, 'text': page_data}
elif he.code == 204 and method == 'DELETE':
# A 204 is a normal response for a delete function
pass
else:
self.fail_json(msg="Unexpected return code when calling {0}: {1}".format(url.geturl(), he))
except (Exception) as e:
self.fail_json(msg="There was an unknown error when trying to connect to {2}: {0} {1}".format(type(e).__name__, e, url.geturl()))
if not self.version_checked: for attempt in range(max_retries + 1): # attempt 0 = first try
# In PY2 we get back an HTTPResponse object but PY2 is returning an addinfourl
# First try to get the headers in PY3 format and then drop down to PY2.
try:
controller_type = response.getheader('X-API-Product-Name', None)
controller_version = response.getheader('X-API-Product-Version', None)
except Exception:
controller_type = response.info().getheader('X-API-Product-Name', None)
controller_version = response.info().getheader('X-API-Product-Version', None)
parsed_collection_version = Version(self._COLLECTION_VERSION).version if attempt > 0:
if controller_version: sleep_time = (backoff_factor ** (attempt - 1)) * (0.5 + random.random())
parsed_controller_version = Version(controller_version).version self.warn(
if controller_type == 'AWX': 'Retrying {0} {1} (attempt {2}/{3}) after {4}s due to status {5}'.format(
collection_compare_ver = parsed_collection_version[0] method, url.path, attempt, max_retries, sleep_time,
controller_compare_ver = parsed_controller_version[0] last_response if last_response else 'connection error'
else:
collection_compare_ver = "{0}.{1}".format(parsed_collection_version[0], parsed_collection_version[1])
controller_compare_ver = '{0}.{1}'.format(parsed_controller_version[0], parsed_controller_version[1])
if self._COLLECTION_TYPE not in self.collection_to_version or self.collection_to_version[self._COLLECTION_TYPE] != controller_type:
self.warn("You are using the {0} version of this collection but connecting to {1}".format(self._COLLECTION_TYPE, controller_type))
elif collection_compare_ver != controller_compare_ver:
self.warn(
"You are running collection version {0} but connecting to {2} version {1}".format(
self._COLLECTION_VERSION, controller_version, controller_type
)
) )
)
time.sleep(sleep_time)
self.version_checked = True
response_body = ''
try:
response_body = response.read()
except (Exception) as e:
self.fail_json(msg="Failed to read response body: {0}".format(e))
response_json = {}
if response_body and response_body != '':
try: try:
response_json = loads(response_body) response = self.session.open(
except (Exception) as e: method, url.geturl(),
self.fail_json(msg="Failed to parse the response json: {0}".format(e)) headers=headers,
timeout=self.request_timeout,
validate_certs=self.verify_ssl,
follow_redirects=True,
data=data
)
if PY2: except (SSLValidationError) as ssl_err:
status_code = response.getcode() # SSL errors are never retryable — cert problems won't fix themselves
else: self.fail_json(msg="Could not establish a secure connection to your host ({0}): {1}.".format(url.netloc, ssl_err))
status_code = response.status
return {'status_code': status_code, 'json': response_json} except (ConnectionError) as con_err:
# Connection errors may be transient — retry if we have attempts left
last_response = 'ConnectionError'
if attempt < max_retries:
continue
self.fail_json(msg="There was a network error of some kind trying to connect to your host ({0}): {1}.".format(url.netloc, con_err))
except (HTTPError) as he:
# ---- Retryable HTTP errors ----
if self.is_retryable(he.code, method, url.path):
# Exhausted retries on a retryable error go on to regular failure checks.
if attempt < max_retries:
continue
# Exhausted retries - provide informative message
self.fail_json(
msg="Request to {0} failed with status {1} after {2} retries. "
"This may indicate the server is overloaded.".format(url.path, he.code, max_retries)
)
# ---- Non-retryable HTTP errors (existing behaviour preserved) ----
if he.code >= 500:
self.fail_json(msg='The host sent back a server error ({1}): {0}. Please check the logs and try again later'.format(url.path, he))
elif he.code == 401:
self.fail_json(msg='Invalid authentication credentials for {0} (HTTP 401).'.format(url.path))
elif he.code == 403:
body = he.read()
raw = body.decode('utf-8') if isinstance(body, bytes) else str(body)
if 'unable to connect to database' in raw.lower():
if attempt < max_retries:
continue
self.fail_json(
msg="Request to {0} failed with status 403 (database unavailable) after {1} retries.".format(url.path, max_retries),
)
# Reuse raw instead of reading again
try:
err_msg = loads(raw)
err_msg = err_msg['detail']
except (ValueError, KeyError):
err_msg = raw
prepend_msg = " Use the collection ansible.platform to modify resources Organization, User, or Team." if (
"this resource via the platform ingress") in err_msg else ""
self.fail_json(msg="You don't have permission to {1} to {0} (HTTP 403).{2}".format(url.path, method, prepend_msg))
elif he.code == 404:
if kwargs.get('return_none_on_404', False):
return None
self.fail_json(msg='The requested object could not be found at {0}.'.format(url.path))
elif he.code == 405:
self.fail_json(msg="Cannot make a request with the {0} method to this endpoint {1}".format(method, url.path))
elif he.code >= 400:
page_data = he.read()
try:
return {'status_code': he.code, 'json': loads(page_data)}
except ValueError:
return {'status_code': he.code, 'text': page_data}
else:
self.fail_json(msg="Unexpected return code when calling {0}: {1}".format(url.geturl(), he))
except (Exception) as e:
self.fail_json(msg="There was an unknown error when trying to connect to {2}: {0} {1}".format(type(e).__name__, e, url.geturl()))
# ----------------------------------------------------------------
# Successful response — fall through from session.open()
# The version check and response parsing happen once on success
# ----------------------------------------------------------------
if not self.version_checked:
try:
controller_type = response.getheader('X-API-Product-Name', None)
controller_version = response.getheader('X-API-Product-Version', None)
except Exception:
controller_type = response.info().getheader('X-API-Product-Name', None)
controller_version = response.info().getheader('X-API-Product-Version', None)
parsed_collection_version = Version(self._COLLECTION_VERSION).version
if controller_version:
parsed_controller_version = Version(controller_version).version
if controller_type == 'AWX':
collection_compare_ver = parsed_collection_version[0]
controller_compare_ver = parsed_controller_version[0]
else:
collection_compare_ver = "{0}.{1}".format(parsed_collection_version[0], parsed_collection_version[1])
controller_compare_ver = '{0}.{1}'.format(parsed_controller_version[0], parsed_controller_version[1])
if self._COLLECTION_TYPE not in self.collection_to_version or self.collection_to_version[self._COLLECTION_TYPE] != controller_type:
self.warn("You are using the {0} version of this collection but connecting to {1}".format(self._COLLECTION_TYPE, controller_type))
elif collection_compare_ver != controller_compare_ver:
self.warn(
"You are running collection version {0} but connecting to {2} version {1}".format(
self._COLLECTION_VERSION, controller_version, controller_type
)
)
self.version_checked = True
response_body = ''
try:
response_body = response.read()
except (Exception) as e:
self.fail_json(msg="Failed to read response body: {0}".format(e))
response_json = {}
if response_body and response_body != '':
try:
response_json = loads(response_body)
except (Exception) as e:
self.fail_json(msg="Failed to parse the response json: {0}".format(e))
if PY2:
status_code = response.getcode()
else:
status_code = response.status
return {'status_code': status_code, 'json': response_json}
def api_path(self, app_key=None): def api_path(self, app_key=None):