mirror of
https://github.com/ansible/awx.git
synced 2026-05-19 14:57:39 -02:30
only reap non-netsplit nodes
This commit is contained in:
@@ -69,14 +69,43 @@ class TaskManager():
|
||||
|
||||
'''
|
||||
Tasks that are running and SHOULD have a celery task.
|
||||
{
|
||||
'execution_node': [j1, j2,...],
|
||||
'execution_node': [j3],
|
||||
...
|
||||
}
|
||||
'''
|
||||
def get_running_tasks(self):
|
||||
execution_nodes = {}
|
||||
now = tz_now()
|
||||
return list(UnifiedJob.objects.filter(Q(status='running') |
|
||||
jobs = list(UnifiedJob.objects.filter(Q(status='running') |
|
||||
(Q(status='waiting', modified__lte=now - timedelta(seconds=60)))))
|
||||
for j in jobs:
|
||||
if j.execution_node in execution_nodes:
|
||||
execution_nodes[j.execution_node].append(j)
|
||||
elif j.execution_node not in execution_nodes:
|
||||
execution_nodes[j.execution_node] = [j]
|
||||
return execution_nodes
|
||||
|
||||
'''
|
||||
Tasks that are currently running in celery
|
||||
|
||||
Transform:
|
||||
{
|
||||
"celery@ec2-54-204-222-62.compute-1.amazonaws.com": [],
|
||||
"celery@ec2-54-163-144-168.compute-1.amazonaws.com": [{
|
||||
...
|
||||
"id": "5238466a-f8c7-43b3-9180-5b78e9da8304",
|
||||
...
|
||||
}]
|
||||
}
|
||||
|
||||
to:
|
||||
{
|
||||
"celery@ec2-54-204-222-62.compute-1.amazonaws.com": [
|
||||
"5238466a-f8c7-43b3-9180-5b78e9da8304",
|
||||
]
|
||||
}
|
||||
'''
|
||||
def get_active_tasks(self):
|
||||
inspector = inspect()
|
||||
@@ -86,15 +115,26 @@ class TaskManager():
|
||||
logger.warn("Ignoring celery task inspector")
|
||||
active_task_queues = None
|
||||
|
||||
active_tasks = set()
|
||||
queues = None
|
||||
|
||||
if active_task_queues is not None:
|
||||
queues = {}
|
||||
for queue in active_task_queues:
|
||||
active_tasks = set()
|
||||
map(lambda at: active_tasks.add(at['id']), active_task_queues[queue])
|
||||
|
||||
# queue is of the form celery@myhost.com
|
||||
queue_name = queue.split('@')
|
||||
if len(queue_name) > 1:
|
||||
queue_name = queue_name[1]
|
||||
else:
|
||||
queue_name = queue_name[0]
|
||||
queues[queue_name] = active_tasks
|
||||
else:
|
||||
if not hasattr(settings, 'CELERY_UNIT_TEST'):
|
||||
return (None, None)
|
||||
|
||||
return (active_task_queues, active_tasks)
|
||||
return (active_task_queues, queues)
|
||||
|
||||
def get_latest_project_update_tasks(self, all_sorted_tasks):
|
||||
project_ids = Set()
|
||||
@@ -380,32 +420,38 @@ class TaskManager():
|
||||
|
||||
logger.debug("Failing inconsistent running jobs.")
|
||||
celery_task_start_time = tz_now()
|
||||
active_task_queues, active_tasks = self.get_active_tasks()
|
||||
active_task_queues, active_queues = self.get_active_tasks()
|
||||
cache.set('last_celery_task_cleanup', tz_now())
|
||||
|
||||
if active_tasks is None:
|
||||
if active_queues is None:
|
||||
logger.error('Failed to retrieve active tasks from celery')
|
||||
return None
|
||||
|
||||
all_running_sorted_tasks = self.get_running_tasks()
|
||||
for task in all_running_sorted_tasks:
|
||||
|
||||
if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
|
||||
# TODO: try catch the getting of the job. The job COULD have been deleted
|
||||
if isinstance(task, WorkflowJob):
|
||||
continue
|
||||
if task.modified > celery_task_start_time:
|
||||
continue
|
||||
task.status = 'failed'
|
||||
task.job_explanation += ' '.join((
|
||||
'Task was marked as running in Tower but was not present in',
|
||||
'Celery, so it has been marked as failed.',
|
||||
))
|
||||
task.save()
|
||||
awx_tasks._send_notification_templates(task, 'failed')
|
||||
task.websocket_emit_status('failed')
|
||||
logger.error("%s appears orphaned... marking as failed", task.log_format)
|
||||
|
||||
'''
|
||||
Only consider failing tasks on instances for which we obtained a task
|
||||
list from celery for.
|
||||
'''
|
||||
execution_nodes_jobs = self.get_running_tasks()
|
||||
for node, node_jobs in execution_nodes_jobs.iteritems():
|
||||
if node not in active_queues:
|
||||
continue
|
||||
active_tasks = active_queues[node]
|
||||
for task in node_jobs:
|
||||
if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
|
||||
# TODO: try catch the getting of the job. The job COULD have been deleted
|
||||
if isinstance(task, WorkflowJob):
|
||||
continue
|
||||
if task.modified > celery_task_start_time:
|
||||
continue
|
||||
task.status = 'failed'
|
||||
task.job_explanation += ' '.join((
|
||||
'Task was marked as running in Tower but was not present in',
|
||||
'Celery, so it has been marked as failed.',
|
||||
))
|
||||
task.save()
|
||||
awx_tasks._send_notification_templates(task, 'failed')
|
||||
task.websocket_emit_status('failed')
|
||||
logger.error("Task %s appears orphaned... marking as failed" % task)
|
||||
|
||||
def calculate_capacity_used(self, tasks):
|
||||
for rampart_group in self.graph:
|
||||
|
||||
Reference in New Issue
Block a user