diff --git a/awx/main/scheduler/kubernetes.py b/awx/main/scheduler/kubernetes.py index f4e35caec9..90f2849c3d 100644 --- a/awx/main/scheduler/kubernetes.py +++ b/awx/main/scheduler/kubernetes.py @@ -3,6 +3,7 @@ import stat import time import yaml import tempfile +import logging from base64 import b64encode from django.conf import settings @@ -11,6 +12,8 @@ from django.utils.functional import cached_property from awx.main.utils.common import parse_yaml_or_json +logger = logging.getLogger('awx.main.scheduler') + class PodManager(object): @@ -21,29 +24,33 @@ class PodManager(object): if not self.credential.kubernetes: raise RuntimeError('Pod deployment cannot occur without a Kubernetes credential') - self.kube_api.create_namespaced_pod(body=self.pod_definition, namespace=self.namespace, - _request_timeout=settings.AWX_CONTAINER_GROUP_DEFAULT_LAUNCH_TIMEOUT) + _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT) - while True: + num_retries = settings.AWX_CONTAINER_GROUP_POD_LAUNCH_RETRIES + for retry_attempt in range(num_retries - 1): + logger.debug(f"Checking for pod {self.pod_name}. Attempt {retry_attempt + 1} of {num_retries}") pod = self.kube_api.read_namespaced_pod(name=self.pod_name, namespace=self.namespace, - _request_timeout=settings.AWX_CONTAINER_GROUP_DEFAULT_LAUNCH_TIMEOUT) + _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT) if pod.status.phase != 'Pending': break - time.sleep(1) + else: + logger.debug(f"Pod {self.pod_name} is Pending.") + time.sleep(settings.AWX_CONTAINER_GROUP_POD_LAUNCH_RETRY_DELAY) + continue if pod.status.phase == 'Running': + logger.debug(f"Pod {self.pod_name} is online.") return pod else: - raise RuntimeError(f"Unhandled Pod phase: {pod.status.phase}") - + logger.warn(f"Pod {self.pod_name} did not start. 
Status is {pod.status.phase}.") def delete(self): return self.kube_api.delete_namespaced_pod(name=self.pod_name, namespace=self.namespace, - _request_timeout=settings.AWX_CONTAINER_GROUP_DEFAULT_LAUNCH_TIMEOUT) + _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT) @property def namespace(self): diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 6253446cd5..d46965ff46 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -1405,13 +1405,15 @@ class BaseTask(object): if isinstance(exc, ApiException) and exc.status == 403: try: if 'exceeded quota' in json.loads(exc.body)['message']: - # If the k8s cluster does not have capacity, we move the job back into - # pending and immediately reschedule the task manager. + # If the k8s cluster does not have capacity, we move the + # job back into pending and wait until the next run of + # the task manager. This does not exactly play well with + # our current instance group precedence logic, since it + # will just sit here forever if kubernetes returns this + # error. logger.warn(exc.body) logger.warn(f"Could not launch pod for {log_name}. Exceeded quota.") - time.sleep(10) self.update_model(task.pk, status='pending') - schedule_task_manager() return except Exception: logger.exception(f"Unable to handle response from Kubernetes API for {log_name}.") @@ -1420,7 +1422,6 @@ class BaseTask(object): self.update_model(task.pk, status='error', result_traceback=exc.body) return - logger.debug(f"Pod online. 
Starting {log_name}.") self.update_model(task.pk, execution_node=pod_manager.pod_name) return pod_manager @@ -1833,7 +1834,10 @@ class RunJob(BaseTask): if job.is_containerized: from awx.main.scheduler.kubernetes import PodManager # prevent circular import - PodManager(job).delete() + pm = PodManager(job) + logger.debug(f"Deleting pod {pm.pod_name}") + pm.delete() + try: inventory = job.inventory diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 639d19f9e8..07c76f8b01 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -67,7 +67,9 @@ DATABASES = { } } -AWX_CONTAINER_GROUP_DEFAULT_LAUNCH_TIMEOUT = 10 +AWX_CONTAINER_GROUP_K8S_API_TIMEOUT = 10 +AWX_CONTAINER_GROUP_POD_LAUNCH_RETRIES = 100 +AWX_CONTAINER_GROUP_POD_LAUNCH_RETRY_DELAY = 5 AWX_CONTAINER_GROUP_DEFAULT_NAMESPACE = 'default' AWX_CONTAINER_GROUP_DEFAULT_IMAGE = 'ansible/ansible-runner'