move code linting to a stricter pep8-esque auto-formatting tool, black

This commit is contained in:
Ryan Petrello
2021-03-19 12:44:51 -04:00
parent 9b702e46fe
commit c2ef0a6500
671 changed files with 20538 additions and 21924 deletions

View File

@@ -85,10 +85,7 @@ class SimpleDAG(object):
color = 'red'
elif obj.do_not_run is True:
color = 'gray'
doc += "%s [color = %s]\n" % (
run_status(n['node_object']),
color
)
doc += "%s [color = %s]\n" % (run_status(n['node_object']), color)
for label, edges in self.node_from_edges_by_label.items():
for from_node, to_nodes in edges.items():
for to_node in to_nodes:
@@ -104,9 +101,9 @@ class SimpleDAG(object):
def add_node(self, obj, metadata=None):
if self.find_ord(obj) is None:
'''
"""
Assume node is a root node until a child is added
'''
"""
node_index = len(self.nodes)
self.root_nodes.add(node_index)
self.node_obj_to_node_index[obj] = node_index
@@ -129,10 +126,8 @@ class SimpleDAG(object):
elif to_obj_ord is None:
raise LookupError("To object not found {}".format(to_obj))
self.node_from_edges_by_label.setdefault(label, dict()) \
.setdefault(from_obj_ord, [])
self.node_to_edges_by_label.setdefault(label, dict()) \
.setdefault(to_obj_ord, [])
self.node_from_edges_by_label.setdefault(label, dict()).setdefault(from_obj_ord, [])
self.node_to_edges_by_label.setdefault(label, dict()).setdefault(to_obj_ord, [])
self.node_from_edges_by_label[label][from_obj_ord].append(to_obj_ord)
self.node_to_edges_by_label[label][to_obj_ord].append(from_obj_ord)
@@ -141,9 +136,7 @@ class SimpleDAG(object):
return self.node_obj_to_node_index.get(obj, None)
def _get_children_by_label(self, node_index, label):
return [self.nodes[index] for index in
self.node_from_edges_by_label.get(label, {})
.get(node_index, [])]
return [self.nodes[index] for index in self.node_from_edges_by_label.get(label, {}).get(node_index, [])]
def get_children(self, obj, label=None):
this_ord = self.find_ord(obj)
@@ -157,9 +150,7 @@ class SimpleDAG(object):
return nodes
def _get_parents_by_label(self, node_index, label):
return [self.nodes[index] for index in
self.node_to_edges_by_label.get(label, {})
.get(node_index, [])]
return [self.nodes[index] for index in self.node_to_edges_by_label.get(label, {}).get(node_index, [])]
def get_parents(self, obj, label=None):
this_ord = self.find_ord(obj)

View File

@@ -1,4 +1,3 @@
from django.utils.translation import ugettext_lazy as _
from django.utils.encoding import smart_text
@@ -21,18 +20,14 @@ class WorkflowDAG(SimpleDAG):
def _init_graph(self, workflow_job_or_jt):
if hasattr(workflow_job_or_jt, 'workflow_job_template_nodes'):
vals = ['from_workflowjobtemplatenode_id', 'to_workflowjobtemplatenode_id']
filters = {
'from_workflowjobtemplatenode__workflow_job_template_id': workflow_job_or_jt.id
}
filters = {'from_workflowjobtemplatenode__workflow_job_template_id': workflow_job_or_jt.id}
workflow_nodes = workflow_job_or_jt.workflow_job_template_nodes
success_nodes = WorkflowJobTemplateNode.success_nodes.through.objects.filter(**filters).values_list(*vals)
failure_nodes = WorkflowJobTemplateNode.failure_nodes.through.objects.filter(**filters).values_list(*vals)
always_nodes = WorkflowJobTemplateNode.always_nodes.through.objects.filter(**filters).values_list(*vals)
elif hasattr(workflow_job_or_jt, 'workflow_job_nodes'):
vals = ['from_workflowjobnode_id', 'to_workflowjobnode_id']
filters = {
'from_workflowjobnode__workflow_job_id': workflow_job_or_jt.id
}
filters = {'from_workflowjobnode__workflow_job_id': workflow_job_or_jt.id}
workflow_nodes = workflow_job_or_jt.workflow_job_nodes
success_nodes = WorkflowJobNode.success_nodes.through.objects.filter(**filters).values_list(*vals)
failure_nodes = WorkflowJobNode.failure_nodes.through.objects.filter(**filters).values_list(*vals)
@@ -76,15 +71,16 @@ class WorkflowDAG(SimpleDAG):
obj = node['node_object']
parent_nodes = [p['node_object'] for p in self.get_parents(obj)]
for p in parent_nodes:
#node has a status
# node has a status
if p.job and p.job.status in ["successful", "failed"]:
if p.job and p.job.status == "successful":
status = "success_nodes"
elif p.job and p.job.status == "failed":
status = "failure_nodes"
#check that the nodes status matches either a pathway of the same status or is an always path.
if (p not in [node['node_object'] for node in self.get_parents(obj, status)] and
p not in [node['node_object'] for node in self.get_parents(obj, "always_nodes")]):
# check that the nodes status matches either a pathway of the same status or is an always path.
if p not in [node['node_object'] for node in self.get_parents(obj, status)] and p not in [
node['node_object'] for node in self.get_parents(obj, "always_nodes")
]:
return False
return True
@@ -101,14 +97,11 @@ class WorkflowDAG(SimpleDAG):
continue
elif obj.job:
if obj.job.status in ['failed', 'error', 'canceled']:
nodes.extend(self.get_children(obj, 'failure_nodes') +
self.get_children(obj, 'always_nodes'))
nodes.extend(self.get_children(obj, 'failure_nodes') + self.get_children(obj, 'always_nodes'))
elif obj.job.status == 'successful':
nodes.extend(self.get_children(obj, 'success_nodes') +
self.get_children(obj, 'always_nodes'))
nodes.extend(self.get_children(obj, 'success_nodes') + self.get_children(obj, 'always_nodes'))
elif obj.unified_job_template is None:
nodes.extend(self.get_children(obj, 'failure_nodes') +
self.get_children(obj, 'always_nodes'))
nodes.extend(self.get_children(obj, 'failure_nodes') + self.get_children(obj, 'always_nodes'))
else:
# This catches root nodes or ANY convergence nodes
if not obj.all_parents_must_converge and self._are_relevant_parents_finished(n):
@@ -157,8 +150,7 @@ class WorkflowDAG(SimpleDAG):
for node in failed_nodes:
obj = node['node_object']
if (len(self.get_children(obj, 'failure_nodes')) +
len(self.get_children(obj, 'always_nodes'))) == 0:
if (len(self.get_children(obj, 'failure_nodes')) + len(self.get_children(obj, 'always_nodes'))) == 0:
if obj.unified_job_template is None:
res = True
failed_unified_job_template_node_ids.append(str(obj.id))
@@ -167,8 +159,10 @@ class WorkflowDAG(SimpleDAG):
failed_path_nodes_id_status.append((str(obj.id), obj.job.status))
if res is True:
s = _("No error handling path for workflow job node(s) [{node_status}]. Workflow job "
"node(s) missing unified job template and error handling path [{no_ufjt}].")
s = _(
"No error handling path for workflow job node(s) [{node_status}]. Workflow job "
"node(s) missing unified job template and error handling path [{no_ufjt}]."
)
parms = {
'node_status': '',
'no_ufjt': '',
@@ -190,13 +184,13 @@ class WorkflowDAG(SimpleDAG):
Return a boolean
'''
def _are_all_nodes_dnr_decided(self, workflow_nodes):
for n in workflow_nodes:
if n.do_not_run is False and not n.job and n.unified_job_template:
return False
return True
r'''
Determine if a node (1) is ready to be marked do_not_run and (2) should
be marked do_not_run.
@@ -206,30 +200,27 @@ class WorkflowDAG(SimpleDAG):
Return a boolean
'''
def _should_mark_node_dnr(self, node, parent_nodes):
for p in parent_nodes:
if p.do_not_run is True:
pass
elif p.job:
if p.job.status == 'successful':
if node in (self.get_children(p, 'success_nodes') +
self.get_children(p, 'always_nodes')):
if node in (self.get_children(p, 'success_nodes') + self.get_children(p, 'always_nodes')):
return False
elif p.job.status in ['failed', 'error', 'canceled']:
if node in (self.get_children(p, 'failure_nodes') +
self.get_children(p, 'always_nodes')):
if node in (self.get_children(p, 'failure_nodes') + self.get_children(p, 'always_nodes')):
return False
else:
return False
elif not p.do_not_run and p.unified_job_template is None:
if node in (self.get_children(p, 'failure_nodes') +
self.get_children(p, 'always_nodes')):
if node in (self.get_children(p, 'failure_nodes') + self.get_children(p, 'always_nodes')):
return False
else:
return False
return True
r'''
determine if the current node is a convergence node by checking if all the
parents are finished then checking to see if all parents meet the needed
@@ -238,6 +229,7 @@ class WorkflowDAG(SimpleDAG):
Return a list object
'''
def mark_dnr_nodes(self):
root_nodes = self.get_root_nodes()
nodes_marked_do_not_run = []

View File

@@ -22,8 +22,7 @@ def deepmerge(a, b):
{'first': {'all_rows': {'fail': 'cat', 'number': '5', 'pass': 'dog'}}}
"""
if isinstance(a, dict) and isinstance(b, dict):
return dict([(k, deepmerge(a.get(k), b.get(k)))
for k in set(a.keys()).union(b.keys())])
return dict([(k, deepmerge(a.get(k), b.get(k))) for k in set(a.keys()).union(b.keys())])
elif b is None:
return a
else:
@@ -31,7 +30,6 @@ def deepmerge(a, b):
class PodManager(object):
def __init__(self, task=None):
self.task = task
@@ -39,16 +37,12 @@ class PodManager(object):
if not self.credential.kubernetes:
raise RuntimeError('Pod deployment cannot occur without a Kubernetes credential')
self.kube_api.create_namespaced_pod(body=self.pod_definition,
namespace=self.namespace,
_request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
self.kube_api.create_namespaced_pod(body=self.pod_definition, namespace=self.namespace, _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
num_retries = settings.AWX_CONTAINER_GROUP_POD_LAUNCH_RETRIES
for retry_attempt in range(num_retries - 1):
logger.debug(f"Checking for pod {self.pod_name}. Attempt {retry_attempt + 1} of {num_retries}")
pod = self.kube_api.read_namespaced_pod(name=self.pod_name,
namespace=self.namespace,
_request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
pod = self.kube_api.read_namespaced_pod(name=self.pod_name, namespace=self.namespace, _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
if pod.status.phase != 'Pending':
break
else:
@@ -64,16 +58,10 @@ class PodManager(object):
@classmethod
def list_active_jobs(self, instance_group):
task = collections.namedtuple('Task', 'id instance_group')(
id='',
instance_group=instance_group
)
task = collections.namedtuple('Task', 'id instance_group')(id='', instance_group=instance_group)
pm = PodManager(task)
try:
for pod in pm.kube_api.list_namespaced_pod(
pm.namespace,
label_selector='ansible-awx={}'.format(settings.INSTALL_UUID)
).to_dict().get('items', []):
for pod in pm.kube_api.list_namespaced_pod(pm.namespace, label_selector='ansible-awx={}'.format(settings.INSTALL_UUID)).to_dict().get('items', []):
job = pod['metadata'].get('labels', {}).get('ansible-awx-job-id')
if job:
try:
@@ -84,9 +72,7 @@ class PodManager(object):
logger.exception('Failed to list pods for container group {}'.format(instance_group))
def delete(self):
return self.kube_api.delete_namespaced_pod(name=self.pod_name,
namespace=self.namespace,
_request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
return self.kube_api.delete_namespaced_pod(name=self.pod_name, namespace=self.namespace, _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
@property
def namespace(self):
@@ -105,14 +91,10 @@ class PodManager(object):
# this feels a little janky, but it's what k8s' own code does
# internally when it reads kube config files from disk:
# https://github.com/kubernetes-client/python-base/blob/0b208334ef0247aad9afcaae8003954423b61a0d/config/kube_config.py#L643
loader = config.kube_config.KubeConfigLoader(
config_dict=self.kube_config
)
loader = config.kube_config.KubeConfigLoader(config_dict=self.kube_config)
cfg = type.__call__(client.Configuration)
loader.load_and_set(cfg)
return client.CoreV1Api(api_client=client.ApiClient(
configuration=cfg
))
return client.CoreV1Api(api_client=client.ApiClient(configuration=cfg))
@property
def pod_name(self):
@@ -123,36 +105,29 @@ class PodManager(object):
default_pod_spec = {
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"namespace": settings.AWX_CONTAINER_GROUP_DEFAULT_NAMESPACE
},
"metadata": {"namespace": settings.AWX_CONTAINER_GROUP_DEFAULT_NAMESPACE},
"spec": {
"containers": [{
"image": settings.AWX_CONTAINER_GROUP_DEFAULT_IMAGE,
"tty": True,
"stdin": True,
"imagePullPolicy": "Always",
"args": [
'sleep', 'infinity'
]
}]
}
"containers": [
{
"image": settings.AWX_CONTAINER_GROUP_DEFAULT_IMAGE,
"tty": True,
"stdin": True,
"imagePullPolicy": "Always",
"args": ['sleep', 'infinity'],
}
]
},
}
pod_spec_override = {}
if self.task and self.task.instance_group.pod_spec_override:
pod_spec_override = parse_yaml_or_json(
self.task.instance_group.pod_spec_override)
pod_spec_override = parse_yaml_or_json(self.task.instance_group.pod_spec_override)
pod_spec = {**default_pod_spec, **pod_spec_override}
if self.task:
pod_spec['metadata'] = deepmerge(
pod_spec.get('metadata', {}),
dict(name=self.pod_name,
labels={
'ansible-awx': settings.INSTALL_UUID,
'ansible-awx-job-id': str(self.task.id)
}))
pod_spec.get('metadata', {}), dict(name=self.pod_name, labels={'ansible-awx': settings.INSTALL_UUID, 'ansible-awx-job-id': str(self.task.id)})
)
pod_spec['spec']['containers'][0]['name'] = self.pod_name
return pod_spec
@@ -164,39 +139,16 @@ def generate_tmp_kube_config(credential, namespace):
"apiVersion": "v1",
"kind": "Config",
"preferences": {},
"clusters": [
{
"name": host_input,
"cluster": {
"server": host_input
}
}
],
"users": [
{
"name": host_input,
"user": {
"token": credential.get_input('bearer_token')
}
}
],
"contexts": [
{
"name": host_input,
"context": {
"cluster": host_input,
"user": host_input,
"namespace": namespace
}
}
],
"current-context": host_input
"clusters": [{"name": host_input, "cluster": {"server": host_input}}],
"users": [{"name": host_input, "user": {"token": credential.get_input('bearer_token')}}],
"contexts": [{"name": host_input, "context": {"cluster": host_input, "user": host_input, "namespace": namespace}}],
"current-context": host_input,
}
if credential.get_input('verify_ssl') and 'ssl_ca_cert' in credential.inputs:
config["clusters"][0]["cluster"]["certificate-authority-data"] = b64encode(
credential.get_input('ssl_ca_cert').encode() # encode to bytes
).decode() # decode the base64 data into a str
credential.get_input('ssl_ca_cert').encode() # encode to bytes
).decode() # decode the base64 data into a str
else:
config["clusters"][0]["cluster"]["insecure-skip-tls-verify"] = True
return config

View File

@@ -31,7 +31,7 @@ from awx.main.models import (
UnifiedJob,
WorkflowApproval,
WorkflowJob,
WorkflowJobTemplate
WorkflowJobTemplate,
)
from awx.main.scheduler.dag_workflow import WorkflowDAG
from awx.main.utils.pglock import advisory_lock
@@ -44,10 +44,9 @@ from awx.main.utils import decrypt_field
logger = logging.getLogger('awx.main.scheduler')
class TaskManager():
class TaskManager:
def __init__(self):
'''
"""
Do NOT put database queries or other potentially expensive operations
in the task manager init. The task manager object is created every time a
job is created, transitions state, and every 30 seconds on each tower node.
@@ -55,7 +54,7 @@ class TaskManager():
The NOOP case is short-circuit logic. If the task manager realizes that another instance
of the task manager is already running, then it short-circuits and decides not to run.
'''
"""
self.graph = dict()
# start task limit indicates how many pending jobs can be started on this
# .schedule() run. Starting jobs is expensive, and there is code in place to reap
@@ -67,25 +66,27 @@ class TaskManager():
self.time_delta_job_explanation = timedelta(seconds=30)
def after_lock_init(self):
'''
"""
Init AFTER we know this instance of the task manager will run because the lock is acquired.
'''
"""
instances = Instance.objects.filter(~Q(hostname=None), enabled=True)
self.real_instances = {i.hostname: i for i in instances}
instances_partial = [SimpleNamespace(obj=instance,
remaining_capacity=instance.remaining_capacity,
capacity=instance.capacity,
jobs_running=instance.jobs_running,
hostname=instance.hostname) for instance in instances]
instances_partial = [
SimpleNamespace(
obj=instance,
remaining_capacity=instance.remaining_capacity,
capacity=instance.capacity,
jobs_running=instance.jobs_running,
hostname=instance.hostname,
)
for instance in instances
]
instances_by_hostname = {i.hostname: i for i in instances_partial}
for rampart_group in InstanceGroup.objects.prefetch_related('instances'):
self.graph[rampart_group.name] = dict(graph=DependencyGraph(),
capacity_total=rampart_group.capacity,
consumed_capacity=0,
instances=[])
self.graph[rampart_group.name] = dict(graph=DependencyGraph(), capacity_total=rampart_group.capacity, consumed_capacity=0, instances=[])
for instance in rampart_group.instances.filter(enabled=True).order_by('hostname'):
if instance.hostname in instances_by_hostname:
self.graph[rampart_group.name]['instances'].append(instances_by_hostname[instance.hostname])
@@ -108,21 +109,20 @@ class TaskManager():
def get_tasks(self, status_list=('pending', 'waiting', 'running')):
jobs = [j for j in Job.objects.filter(status__in=status_list).prefetch_related('instance_group')]
inventory_updates_qs = InventoryUpdate.objects.filter(
status__in=status_list).exclude(source='file').prefetch_related('inventory_source', 'instance_group')
inventory_updates_qs = (
InventoryUpdate.objects.filter(status__in=status_list).exclude(source='file').prefetch_related('inventory_source', 'instance_group')
)
inventory_updates = [i for i in inventory_updates_qs]
# Notice the job_type='check': we want to prevent implicit project updates from blocking our jobs.
project_updates = [p for p in ProjectUpdate.objects.filter(status__in=status_list, job_type='check').prefetch_related('instance_group')]
system_jobs = [s for s in SystemJob.objects.filter(status__in=status_list).prefetch_related('instance_group')]
ad_hoc_commands = [a for a in AdHocCommand.objects.filter(status__in=status_list).prefetch_related('instance_group')]
workflow_jobs = [w for w in WorkflowJob.objects.filter(status__in=status_list)]
all_tasks = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands + workflow_jobs,
key=lambda task: task.created)
all_tasks = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands + workflow_jobs, key=lambda task: task.created)
return all_tasks
def get_running_workflow_jobs(self):
graph_workflow_jobs = [wf for wf in
WorkflowJob.objects.filter(status='running')]
graph_workflow_jobs = [wf for wf in WorkflowJob.objects.filter(status='running')]
return graph_workflow_jobs
def get_inventory_source_tasks(self, all_sorted_tasks):
@@ -156,20 +156,26 @@ class TaskManager():
workflow_ancestors = job.get_ancestor_workflows()
if spawn_node.unified_job_template in set(workflow_ancestors):
can_start = False
logger.info('Refusing to start recursive workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]))
logger.info(
'Refusing to start recursive workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
)
)
display_list = [spawn_node.unified_job_template] + workflow_ancestors
job.job_explanation = gettext_noop(
"Workflow Job spawned from workflow could not start because it "
"would result in recursion (spawn order, most recent first: {})"
"Workflow Job spawned from workflow could not start because it " "would result in recursion (spawn order, most recent first: {})"
).format(', '.join(['<{}>'.format(tmp) for tmp in display_list]))
else:
logger.debug('Starting workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]))
logger.debug(
'Starting workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
)
)
if not job._resources_sufficient_for_launch():
can_start = False
job.job_explanation = gettext_noop("Job spawned from workflow could not start because it "
"was missing a related resource such as project or inventory")
job.job_explanation = gettext_noop(
"Job spawned from workflow could not start because it " "was missing a related resource such as project or inventory"
)
if can_start:
if workflow_job.start_args:
start_args = json.loads(decrypt_field(workflow_job, 'start_args'))
@@ -177,15 +183,16 @@ class TaskManager():
start_args = {}
can_start = job.signal_start(**start_args)
if not can_start:
job.job_explanation = gettext_noop("Job spawned from workflow could not start because it "
"was not in the right state or required manual credentials")
job.job_explanation = gettext_noop(
"Job spawned from workflow could not start because it " "was not in the right state or required manual credentials"
)
if not can_start:
job.status = 'failed'
job.save(update_fields=['status', 'job_explanation'])
job.websocket_emit_status('failed')
# TODO: should we emit a status on the socket here similar to tasks.py awx_periodic_scheduler() ?
#emit_websocket_notification('/socket.io/jobs', '', dict(id=))
# emit_websocket_notification('/socket.io/jobs', '', dict(id=))
def process_finished_workflow_jobs(self, workflow_jobs):
result = []
@@ -251,8 +258,7 @@ class TaskManager():
try:
controller_node = rampart_group.choose_online_controller_node()
except IndexError:
logger.debug("No controllers available in group {} to run {}".format(
rampart_group.name, task.log_format))
logger.debug("No controllers available in group {} to run {}".format(rampart_group.name, task.log_format))
return
task.status = 'waiting'
@@ -275,14 +281,12 @@ class TaskManager():
# non-Ansible jobs on isolated instances run on controller
task.instance_group = rampart_group.controller
task.execution_node = random.choice(list(rampart_group.controller.instances.all().values_list('hostname', flat=True)))
logger.debug('Submitting isolated {} to queue {} on node {}.'.format(
task.log_format, task.instance_group.name, task.execution_node))
logger.debug('Submitting isolated {} to queue {} on node {}.'.format(task.log_format, task.instance_group.name, task.execution_node))
elif controller_node:
task.instance_group = rampart_group
task.execution_node = instance.hostname
task.controller_node = controller_node
logger.debug('Submitting isolated {} to queue {} controlled by {}.'.format(
task.log_format, task.execution_node, controller_node))
logger.debug('Submitting isolated {} to queue {} controlled by {}.'.format(task.log_format, task.execution_node, controller_node))
elif rampart_group.is_container_group:
# find one real, non-containerized instance with capacity to
# act as the controller for k8s API interaction
@@ -295,9 +299,7 @@ class TaskManager():
break
task.instance_group = rampart_group
if match is None:
logger.warn(
'No available capacity to run containerized <{}>.'.format(task.log_format)
)
logger.warn('No available capacity to run containerized <{}>.'.format(task.log_format))
else:
if task.supports_isolation():
task.controller_node = match.hostname
@@ -306,14 +308,12 @@ class TaskManager():
# so just pick *any* non-isolated, non-containerized host and use it
# as the execution node
task.execution_node = match.hostname
logger.debug('Submitting containerized {} to queue {}.'.format(
task.log_format, task.execution_node))
logger.debug('Submitting containerized {} to queue {}.'.format(task.log_format, task.execution_node))
else:
task.instance_group = rampart_group
if instance is not None:
task.execution_node = instance.hostname
logger.debug('Submitting {} to <instance group, instance> <{},{}>.'.format(
task.log_format, task.instance_group_id, task.execution_node))
logger.debug('Submitting {} to <instance group, instance> <{},{}>.'.format(task.log_format, task.instance_group_id, task.execution_node))
with disable_activity_stream():
task.celery_task_id = str(uuid.uuid4())
task.save()
@@ -330,15 +330,8 @@ class TaskManager():
opts,
queue=task.get_queue_name(),
uuid=task.celery_task_id,
callbacks=[{
'task': handle_work_success.name,
'kwargs': {'task_actual': task_actual}
}],
errbacks=[{
'task': handle_work_error.name,
'args': [task.celery_task_id],
'kwargs': {'subtasks': [task_actual] + dependencies}
}],
callbacks=[{'task': handle_work_success.name, 'kwargs': {'task_actual': task_actual}}],
errbacks=[{'task': handle_work_error.name, 'args': [task.celery_task_id], 'kwargs': {'subtasks': [task_actual] + dependencies}}],
)
task.websocket_emit_status(task.status) # adds to on_commit
@@ -350,32 +343,22 @@ class TaskManager():
self.graph[task.instance_group.name]['graph'].add_job(task)
def create_project_update(self, task):
project_task = Project.objects.get(id=task.project_id).create_project_update(
_eager_fields=dict(launch_type='dependency'))
project_task = Project.objects.get(id=task.project_id).create_project_update(_eager_fields=dict(launch_type='dependency'))
# Project created 1 seconds behind
project_task.created = task.created - timedelta(seconds=1)
project_task.status = 'pending'
project_task.save()
logger.debug(
'Spawned {} as dependency of {}'.format(
project_task.log_format, task.log_format
)
)
logger.debug('Spawned {} as dependency of {}'.format(project_task.log_format, task.log_format))
return project_task
def create_inventory_update(self, task, inventory_source_task):
inventory_task = InventorySource.objects.get(id=inventory_source_task.id).create_inventory_update(
_eager_fields=dict(launch_type='dependency'))
inventory_task = InventorySource.objects.get(id=inventory_source_task.id).create_inventory_update(_eager_fields=dict(launch_type='dependency'))
inventory_task.created = task.created - timedelta(seconds=2)
inventory_task.status = 'pending'
inventory_task.save()
logger.debug(
'Spawned {} as dependency of {}'.format(
inventory_task.log_format, task.log_format
)
)
logger.debug('Spawned {} as dependency of {}'.format(inventory_task.log_format, task.log_format))
# inventory_sources = self.get_inventory_source_tasks([task])
# self.process_inventory_sources(inventory_sources)
return inventory_task
@@ -409,8 +392,7 @@ class TaskManager():
timeout_seconds = timedelta(seconds=latest_inventory_update.inventory_source.update_cache_timeout)
if (latest_inventory_update.finished + timeout_seconds) < now:
return True
if latest_inventory_update.inventory_source.update_on_launch is True and \
latest_inventory_update.status in ['failed', 'canceled', 'error']:
if latest_inventory_update.inventory_source.update_on_launch is True and latest_inventory_update.status in ['failed', 'canceled', 'error']:
return True
return False
@@ -441,9 +423,11 @@ class TaskManager():
then consider the project update found. This is so we don't enter an infinite loop
of updating the project when cache timeout is 0.
'''
if latest_project_update.project.scm_update_cache_timeout == 0 and \
latest_project_update.launch_type == 'dependency' and \
latest_project_update.created == job.created - timedelta(seconds=1):
if (
latest_project_update.project.scm_update_cache_timeout == 0
and latest_project_update.launch_type == 'dependency'
and latest_project_update.created == job.created - timedelta(seconds=1)
):
return False
'''
Normal Cache Timeout Logic
@@ -491,7 +475,7 @@ class TaskManager():
if len(dependencies) > 0:
self.capture_chain_failure_dependencies(task, dependencies)
UnifiedJob.objects.filter(pk__in = [task.pk for task in undeped_tasks]).update(dependencies_processed=True)
UnifiedJob.objects.filter(pk__in=[task.pk for task in undeped_tasks]).update(dependencies_processed=True)
return created_dependencies
def process_pending_tasks(self, pending_tasks):
@@ -506,7 +490,7 @@ class TaskManager():
job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
if task.job_explanation != job_explanation:
if task.created < (tz_now() - self.time_delta_job_explanation):
task.job_explanation = job_explanation
task.job_explanation = job_explanation
tasks_to_update_job_explanation.append(task)
continue
preferred_instance_groups = task.preferred_instance_groups
@@ -529,22 +513,26 @@ class TaskManager():
remaining_capacity = self.get_remaining_capacity(rampart_group.name)
if (
task.task_impact > 0 and # project updates have a cost of zero
not rampart_group.is_container_group and
self.get_remaining_capacity(rampart_group.name) <= 0):
logger.debug("Skipping group {}, remaining_capacity {} <= 0".format(
rampart_group.name, remaining_capacity))
task.task_impact > 0
and not rampart_group.is_container_group # project updates have a cost of zero
and self.get_remaining_capacity(rampart_group.name) <= 0
):
logger.debug("Skipping group {}, remaining_capacity {} <= 0".format(rampart_group.name, remaining_capacity))
continue
execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(task, self.graph[rampart_group.name]['instances']) or \
InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'])
execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
task, self.graph[rampart_group.name]['instances']
) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'])
if execution_instance or rampart_group.is_container_group:
if not rampart_group.is_container_group:
execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
execution_instance.jobs_running += 1
logger.debug("Starting {} in group {} instance {} (remaining_capacity={})".format(
task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity))
logger.debug(
"Starting {} in group {} instance {} (remaining_capacity={})".format(
task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity
)
)
if execution_instance:
execution_instance = self.real_instances[execution_instance.hostname]
@@ -553,8 +541,11 @@ class TaskManager():
found_acceptable_queue = True
break
else:
logger.debug("No instance available in group {} to run job {} w/ capacity requirement {}".format(
rampart_group.name, task.log_format, task.task_impact))
logger.debug(
"No instance available in group {} to run job {} w/ capacity requirement {}".format(
rampart_group.name, task.log_format, task.task_impact
)
)
if not found_acceptable_queue:
task.log_lifecycle("needs_capacity")
job_explanation = gettext_noop("This job is not ready to start because there is not enough available capacity.")
@@ -576,9 +567,9 @@ class TaskManager():
if task.timeout == 0:
continue
if (now - task.created) >= approval_timeout_seconds:
timeout_message = _(
"The approval node {name} ({pk}) has expired after {timeout} seconds."
).format(name=task.name, pk=task.pk, timeout=task.timeout)
timeout_message = _("The approval node {name} ({pk}) has expired after {timeout} seconds.").format(
name=task.name, pk=task.pk, timeout=task.timeout
)
logger.warn(timeout_message)
task.timed_out = True
task.status = 'failed'
@@ -594,9 +585,7 @@ class TaskManager():
# elsewhere
for j in UnifiedJob.objects.filter(
status__in=['pending', 'waiting', 'running'],
).exclude(
execution_node__in=Instance.objects.values_list('hostname', flat=True)
):
).exclude(execution_node__in=Instance.objects.values_list('hostname', flat=True)):
if j.execution_node and not j.is_container_group_task:
logger.error(f'{j.execution_node} is not a registered instance; reaping {j.log_format}')
reap_job(j, 'failed')
@@ -605,13 +594,15 @@ class TaskManager():
self.graph = InstanceGroup.objects.capacity_values(tasks=tasks, graph=self.graph)
def consume_capacity(self, task, instance_group):
logger.debug('{} consumed {} capacity units from {} with prior total of {}'.format(
task.log_format, task.task_impact, instance_group,
self.graph[instance_group]['consumed_capacity']))
logger.debug(
'{} consumed {} capacity units from {} with prior total of {}'.format(
task.log_format, task.task_impact, instance_group, self.graph[instance_group]['consumed_capacity']
)
)
self.graph[instance_group]['consumed_capacity'] += task.task_impact
def get_remaining_capacity(self, instance_group):
return (self.graph[instance_group]['capacity_total'] - self.graph[instance_group]['consumed_capacity'])
return self.graph[instance_group]['capacity_total'] - self.graph[instance_group]['consumed_capacity']
def process_tasks(self, all_sorted_tasks):
running_tasks = [t for t in all_sorted_tasks if t.status in ['waiting', 'running']]

View File

@@ -1,4 +1,3 @@
# Python
import logging