mirror of
https://github.com/ansible/awx.git
synced 2026-01-11 01:57:35 -03:30
Implement retry logic for container group pod launches
This commit is contained in:
parent
b93164e1ed
commit
8f75382b81
@ -3,6 +3,7 @@ import stat
|
||||
import time
|
||||
import yaml
|
||||
import tempfile
|
||||
import logging
|
||||
from base64 import b64encode
|
||||
|
||||
from django.conf import settings
|
||||
@ -11,6 +12,8 @@ from django.utils.functional import cached_property
|
||||
|
||||
from awx.main.utils.common import parse_yaml_or_json
|
||||
|
||||
logger = logging.getLogger('awx.main.scheduler')
|
||||
|
||||
|
||||
class PodManager(object):
|
||||
|
||||
@ -21,29 +24,33 @@ class PodManager(object):
|
||||
if not self.credential.kubernetes:
|
||||
raise RuntimeError('Pod deployment cannot occur without a Kubernetes credential')
|
||||
|
||||
|
||||
self.kube_api.create_namespaced_pod(body=self.pod_definition,
|
||||
namespace=self.namespace,
|
||||
_request_timeout=settings.AWX_CONTAINER_GROUP_DEFAULT_LAUNCH_TIMEOUT)
|
||||
_request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
|
||||
|
||||
while True:
|
||||
num_retries = settings.AWX_CONTAINER_GROUP_POD_LAUNCH_RETRIES
|
||||
for retry_attempt in range(num_retries - 1):
|
||||
logger.debug(f"Checking for pod {self.pod_name}. Attempt {retry_attempt + 1} of {num_retries}")
|
||||
pod = self.kube_api.read_namespaced_pod(name=self.pod_name,
|
||||
namespace=self.namespace,
|
||||
_request_timeout=settings.AWX_CONTAINER_GROUP_DEFAULT_LAUNCH_TIMEOUT)
|
||||
_request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
|
||||
if pod.status.phase != 'Pending':
|
||||
break
|
||||
time.sleep(1)
|
||||
else:
|
||||
logger.debug(f"Pod {self.pod_name} is Pending.")
|
||||
time.sleep(settings.AWX_CONTAINER_GROUP_POD_LAUNCH_RETRY_DELAY)
|
||||
continue
|
||||
|
||||
if pod.status.phase == 'Running':
|
||||
logger.debug(f"Pod {self.pod_name} is online.")
|
||||
return pod
|
||||
else:
|
||||
raise RuntimeError(f"Unhandled Pod phase: {pod.status.phase}")
|
||||
|
||||
logger.warn(f"Pod {self.pod_name} did not start. Status is {pod.status.phase}.")
|
||||
|
||||
def delete(self):
|
||||
return self.kube_api.delete_namespaced_pod(name=self.pod_name,
|
||||
namespace=self.namespace,
|
||||
_request_timeout=settings.AWX_CONTAINER_GROUP_DEFAULT_LAUNCH_TIMEOUT)
|
||||
_request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
|
||||
|
||||
@property
|
||||
def namespace(self):
|
||||
|
||||
@ -1405,13 +1405,15 @@ class BaseTask(object):
|
||||
if isinstance(exc, ApiException) and exc.status == 403:
|
||||
try:
|
||||
if 'exceeded quota' in json.loads(exc.body)['message']:
|
||||
# If the k8s cluster does not have capacity, we move the job back into
|
||||
# pending and immediately reschedule the task manager.
|
||||
# If the k8s cluster does not have capacity, we move the
|
||||
# job back into pending and wait until the next run of
|
||||
# the task manager. This does not exactly play well with
|
||||
# our current instance group precedence logic, since it
|
||||
# will just sit here forever if Kubernetes returns this
|
||||
# error.
|
||||
logger.warn(exc.body)
|
||||
logger.warn(f"Could not launch pod for {log_name}. Exceeded quota.")
|
||||
time.sleep(10)
|
||||
self.update_model(task.pk, status='pending')
|
||||
schedule_task_manager()
|
||||
return
|
||||
except Exception:
|
||||
logger.exception(f"Unable to handle response from Kubernetes API for {log_name}.")
|
||||
@ -1420,7 +1422,6 @@ class BaseTask(object):
|
||||
self.update_model(task.pk, status='error', result_traceback=exc.body)
|
||||
return
|
||||
|
||||
logger.debug(f"Pod online. Starting {log_name}.")
|
||||
self.update_model(task.pk, execution_node=pod_manager.pod_name)
|
||||
return pod_manager
|
||||
|
||||
@ -1833,7 +1834,10 @@ class RunJob(BaseTask):
|
||||
|
||||
if job.is_containerized:
|
||||
from awx.main.scheduler.kubernetes import PodManager # prevent circular import
|
||||
PodManager(job).delete()
|
||||
pm = PodManager(job)
|
||||
logger.debug(f"Deleting pod {pm.pod_name}")
|
||||
pm.delete()
|
||||
|
||||
|
||||
try:
|
||||
inventory = job.inventory
|
||||
|
||||
@ -67,7 +67,9 @@ DATABASES = {
|
||||
}
|
||||
}
|
||||
|
||||
AWX_CONTAINER_GROUP_DEFAULT_LAUNCH_TIMEOUT = 10
|
||||
AWX_CONTAINER_GROUP_K8S_API_TIMEOUT = 10
|
||||
AWX_CONTAINER_GROUP_POD_LAUNCH_RETRIES = 100
|
||||
AWX_CONTAINER_GROUP_POD_LAUNCH_RETRY_DELAY = 5
|
||||
AWX_CONTAINER_GROUP_DEFAULT_NAMESPACE = 'default'
|
||||
AWX_CONTAINER_GROUP_DEFAULT_IMAGE = 'ansible/ansible-runner'
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user