Mirror of https://github.com/ansible/awx.git (synced 2026-02-05 03:24:50 -03:30)

Compare commits: 7 commits, upgrade-sq ... devel_bug_
Commits in this comparison (author and date metadata not captured): 98dc60f9d6, 29addb6ad0, c2f8acebb1, 65d309f44a, 8bcc65fe80, db6e8b9bad, 483417762f
.gitignore (vendored) — 2 lines changed
@@ -150,6 +150,8 @@ use_dev_supervisor.txt

awx/ui/src
awx/ui/build
awx/ui/.ui-built
awx/ui_next

# Docs build stuff
docs/docsite/build/
awx/main/dispatch/pool.py

@@ -7,6 +7,7 @@ import time
import traceback
from datetime import datetime
from uuid import uuid4
import json

import collections
from multiprocessing import Process

@@ -25,7 +26,10 @@ from ansible_base.lib.logging.runtime import log_excess_runtime

from awx.main.models import UnifiedJob
from awx.main.dispatch import reaper
from awx.main.utils.common import convert_mem_str_to_bytes, get_mem_effective_capacity
from awx.main.utils.common import get_mem_effective_capacity, get_corrected_memory, get_corrected_cpu, get_cpu_effective_capacity

# ansible-runner
from ansible_runner.utils.capacity import get_mem_in_bytes, get_cpu_count

if 'run_callback_receiver' in sys.argv:
    logger = logging.getLogger('awx.main.commands.run_callback_receiver')
@@ -307,6 +311,41 @@ class WorkerPool(object):
                logger.exception('could not kill {}'.format(worker.pid))


def get_auto_max_workers():
    """Method we normally rely on to get max_workers

    Uses almost same logic as Instance.local_health_check
    The important thing is to be MORE than Instance.capacity
    so that the task-manager does not over-schedule this node

    Ideally we would just use the capacity from the database plus reserve workers,
    but this poses some bootstrap problems where OCP task containers
    register themselves after startup
    """
    # Get memory from ansible-runner
    total_memory_gb = get_mem_in_bytes()

    # This may replace memory calculation with a user override
    corrected_memory = get_corrected_memory(total_memory_gb)

    # Get same number as max forks based on memory, this function takes memory as bytes
    mem_capacity = get_mem_effective_capacity(corrected_memory, is_control_node=True)

    # Follow same process for CPU capacity constraint
    cpu_count = get_cpu_count()
    corrected_cpu = get_corrected_cpu(cpu_count)
    cpu_capacity = get_cpu_effective_capacity(corrected_cpu, is_control_node=True)

    # Here is what is different from health checks,
    auto_max = max(mem_capacity, cpu_capacity)

    # add magic number of extra workers to ensure
    # we have a few extra workers to run the heartbeat
    auto_max += 7

    return auto_max


class AutoscalePool(WorkerPool):
    """
    An extended pool implementation that automatically scales workers up and
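To make the sizing concrete, here is a small, self-contained sketch of the same shape of calculation. It is illustrative only: the real numbers come from `get_mem_effective_capacity` / `get_cpu_effective_capacity` (whose exact formulas, rounding, and settings handling are not shown in this hunk), and the forks-per-memory and forks-per-CPU ratios below are assumptions for the example, not AWX defaults.

```python
# Illustrative sketch only -- the assumed ratios (mem_per_fork_mb, forks_per_cpu)
# are placeholders, not the values AWX actually uses.
def sketch_auto_max_workers(total_mem_bytes: int, cpu_count: int,
                            mem_per_fork_mb: int = 400, forks_per_cpu: int = 4) -> int:
    mem_capacity = (total_mem_bytes // 2**20) // mem_per_fork_mb  # forks the memory could support
    cpu_capacity = cpu_count * forks_per_cpu                      # forks the CPUs could support
    # take the larger constraint so max_workers stays above Instance.capacity,
    # then add the 7 reserve workers kept free for heartbeat-style housekeeping
    return max(mem_capacity, cpu_capacity) + 7

# e.g. a 16 GiB / 8 CPU node under these assumed ratios:
print(sketch_auto_max_workers(16 * 2**30, 8))  # (16384 // 400 = 40) vs (8 * 4 = 32) -> 40 + 7 = 47
```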
@@ -320,19 +359,7 @@ class AutoscalePool(WorkerPool):
        super(AutoscalePool, self).__init__(*args, **kwargs)

        if self.max_workers is None:
            settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None)
            if settings_absmem is not None:
                # There are 1073741824 bytes in a gigabyte. Convert bytes to gigabytes by dividing by 2**30
                total_memory_gb = convert_mem_str_to_bytes(settings_absmem) // 2**30
            else:
                total_memory_gb = (psutil.virtual_memory().total >> 30) + 1  # noqa: round up
            # Get same number as max forks based on memory, this function takes memory as bytes
            self.max_workers = get_mem_effective_capacity(total_memory_gb * 2**30)
            # add magic prime number of extra workers to ensure
            # we have a few extra workers to run the heartbeat
            self.max_workers += 7
            self.max_workers = get_auto_max_workers()

        # max workers can't be less than min_workers
        self.max_workers = max(self.min_workers, self.max_workers)
@@ -346,6 +373,9 @@ class AutoscalePool(WorkerPool):
        self.scale_up_ct = 0
        self.worker_count_max = 0

        # last time we wrote current tasks, to avoid too much log spam
        self.last_task_list_log = time.monotonic()

    def produce_subsystem_metrics(self, metrics_object):
        metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct)
        metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers))
@@ -463,6 +493,14 @@ class AutoscalePool(WorkerPool):
            self.worker_count_max = new_worker_ct
        return ret

    @staticmethod
    def fast_task_serialization(current_task):
        try:
            return str(current_task.get('task')) + ' - ' + str(sorted(current_task.get('args', []))) + ' - ' + str(sorted(current_task.get('kwargs', {})))
        except Exception:
            # just make sure this does not make things worse
            return str(current_task)

    def write(self, preferred_queue, body):
        if 'guid' in body:
            set_guid(body['guid'])

@@ -484,6 +522,15 @@ class AutoscalePool(WorkerPool):
                if isinstance(body, dict):
                    task_name = body.get('task')
                    logger.warning(f'Workers maxed, queuing {task_name}, load: {sum(len(w.managed_tasks) for w in self.workers)} / {len(self.workers)}')
                # Once every 10 seconds write out task list for debugging
                if time.monotonic() - self.last_task_list_log >= 10.0:
                    task_counts = {}
                    for worker in self.workers:
                        task_slug = self.fast_task_serialization(worker.current_task)
                        task_counts.setdefault(task_slug, 0)
                        task_counts[task_slug] += 1
                    logger.info(f'Running tasks by count:\n{json.dumps(task_counts, indent=2)}')
                    self.last_task_list_log = time.monotonic()
            return super(AutoscalePool, self).write(preferred_queue, body)
        except Exception:
            for conn in connections.all():
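As a standalone illustration of the task slugs this debug log counts (the task names below are made up for the example, not captured AWX output): the slug is the task name plus the sorted positional args and the sorted kwarg names, so repeated submissions of the same call collapse into a single counter.

```python
# Standalone illustration of the slug format used by the periodic task-count log.
def fast_task_serialization(current_task):
    try:
        return str(current_task.get('task')) + ' - ' + str(sorted(current_task.get('args', []))) + ' - ' + str(sorted(current_task.get('kwargs', {})))
    except Exception:
        return str(current_task)

print(fast_task_serialization({'task': 'some.module.heartbeat_task', 'args': [], 'kwargs': {}}))
# some.module.heartbeat_task - [] - []
print(fast_task_serialization({'task': 'sleep_task', 'args': [], 'kwargs': {'seconds': 10}}))
# sleep_task - [] - ['seconds']   (kwarg *names* only; values are not part of the slug)
```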
@@ -238,7 +238,7 @@ class AWXConsumerPG(AWXConsumerBase):
    def run(self, *args, **kwargs):
        super(AWXConsumerPG, self).run(*args, **kwargs)

        logger.info(f"Running worker {self.name} listening to queues {self.queues}")
        logger.info(f"Running {self.name}, workers min={self.pool.min_workers} max={self.pool.max_workers}, listening to queues {self.queues}")
        init = False

        while True:
@@ -602,7 +602,7 @@ class JobEvent(BasePlaybookEvent):
                h.last_job_host_summary_id = host_mapping[h.id]
                updated_hosts.add(h)

        Host.objects.bulk_update(list(updated_hosts), ['last_job_id', 'last_job_host_summary_id'], batch_size=100)
        Host.objects.bulk_update(sorted(updated_hosts, key=lambda host: host.id), ['last_job_id', 'last_job_host_summary_id'], batch_size=100)

        # Create/update Host Metrics
        self._update_host_metrics(updated_hosts_list)
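The old call passed the hosts in set-iteration order, which can differ between processes; the new call sorts by primary key. A minimal sketch of the general pattern, assuming (as the contention test further down suggests) that concurrent batched UPDATEs over the same rows in different orders can deadlock in PostgreSQL; the `safe_bulk_update` helper name is invented for this example.

```python
# Sketch of the ordering fix, not AWX code. With row-level locking, two writers
# that update the same rows in *different* orders can each wait on a lock the
# other holds (deadlock). Sorting by primary key gives every writer the same
# lock-acquisition order, so the later one simply blocks until the first commits.
def safe_bulk_update(model, objs, fields, batch_size=100):
    ordered = sorted(objs, key=lambda obj: obj.pk)  # deterministic lock order
    model.objects.bulk_update(ordered, fields, batch_size=batch_size)
```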
@@ -62,7 +62,8 @@ def start_fact_cache(hosts, destination, log_data, timeout=None, inventory_id=No


def raw_update_hosts(host_list):
    Host.objects.bulk_update(host_list, ['ansible_facts', 'ansible_facts_modified'])
    host_list = sorted(host_list, key=lambda host: host.id)
    Host.objects.bulk_update(host_list, ['ansible_facts', 'ansible_facts_modified'], batch_size=100)


def update_hosts(host_list, max_tries=5):
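The body of `update_hosts(host_list, max_tries=5)` is not part of this hunk; a plausible reading, given the signature and the deadlock scenario exercised in the test below, is a retry loop around `raw_update_hosts`. The sketch below is an assumption about that shape, not the actual AWX implementation.

```python
# Hypothetical sketch of a retrying wrapper -- the real update_hosts body is not
# shown in this diff, only its signature (host_list, max_tries=5).
import time as _time

from django.db import OperationalError


def update_hosts_sketch(host_list, max_tries=5):
    for attempt in range(1, max_tries + 1):
        try:
            raw_update_hosts(host_list)  # sorted + batched bulk_update from the hunk above
            return
        except OperationalError:
            # e.g. a deadlock detected by PostgreSQL; back off briefly and retry
            if attempt == max_tries:
                raise
            _time.sleep(attempt)
```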
awx/main/tests/data/sleep_task.py — new file, 17 lines
@@ -0,0 +1,17 @@
import time
import logging

from awx.main.dispatch import get_task_queuename
from awx.main.dispatch.publish import task


logger = logging.getLogger(__name__)


@task(queue=get_task_queuename)
def sleep_task(seconds=10, log=False):
    if log:
        logger.info('starting sleep_task')
    time.sleep(seconds)
    if log:
        logger.info('finished sleep_task')
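A hedged usage sketch: assuming the `@task` decorator from `awx.main.dispatch.publish` exposes `delay()` the way the firehose script at the bottom of this diff uses it, and that `delay()` forwards keyword arguments to the worker, a single task could be queued from a Django shell like this.

```python
# Assumes a running dispatcher; delay() with kwargs is an extrapolation from the
# zero-argument sleep_task.delay() call in tools/scripts/firehose_tasks.py.
from awx.main.tests.data.sleep_task import sleep_task

sleep_task.delay()                      # one 10-second task with defaults
sleep_task.delay(seconds=30, log=True)  # assumed kwarg passthrough
```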
@@ -34,40 +34,18 @@ def test_wrapup_does_send_notifications(mocker):
    mock.assert_called_once_with('succeeded')


class FakeRedis:
    def keys(self, *args, **kwargs):
        return []

    def set(self):
        pass

    def get(self):
        return None

    @classmethod
    def from_url(cls, *args, **kwargs):
        return cls()

    def pipeline(self):
        return self


class TestCallbackBrokerWorker(TransactionTestCase):
    @pytest.fixture(autouse=True)
    def turn_off_websockets(self):
    def turn_off_websockets_and_redis(self, fake_redis):
        with mock.patch('awx.main.dispatch.worker.callback.emit_event_detail', lambda *a, **kw: None):
            yield

    def get_worker(self):
        with mock.patch('redis.Redis', new=FakeRedis):  # turn off redis stuff
            return CallbackBrokerWorker()

    def event_create_kwargs(self):
        inventory_update = InventoryUpdate.objects.create(source='file', inventory_source=InventorySource.objects.create(source='file'))
        return dict(inventory_update=inventory_update, created=inventory_update.created)

    def test_flush_with_valid_event(self):
        worker = self.get_worker()
        worker = CallbackBrokerWorker()
        events = [InventoryUpdateEvent(uuid=str(uuid4()), **self.event_create_kwargs())]
        worker.buff = {InventoryUpdateEvent: events}
        worker.flush()

@@ -75,7 +53,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        assert InventoryUpdateEvent.objects.filter(uuid=events[0].uuid).count() == 1

    def test_flush_with_invalid_event(self):
        worker = self.get_worker()
        worker = CallbackBrokerWorker()
        kwargs = self.event_create_kwargs()
        events = [
            InventoryUpdateEvent(uuid=str(uuid4()), stdout='good1', **kwargs),

@@ -90,7 +68,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        assert worker.buff == {InventoryUpdateEvent: [events[1]]}

    def test_duplicate_key_not_saved_twice(self):
        worker = self.get_worker()
        worker = CallbackBrokerWorker()
        events = [InventoryUpdateEvent(uuid=str(uuid4()), **self.event_create_kwargs())]
        worker.buff = {InventoryUpdateEvent: events.copy()}
        worker.flush()

@@ -104,7 +82,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        assert worker.buff.get(InventoryUpdateEvent, []) == []

    def test_give_up_on_bad_event(self):
        worker = self.get_worker()
        worker = CallbackBrokerWorker()
        events = [InventoryUpdateEvent(uuid=str(uuid4()), counter=-2, **self.event_create_kwargs())]
        worker.buff = {InventoryUpdateEvent: events.copy()}

@@ -117,7 +95,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        assert InventoryUpdateEvent.objects.filter(uuid=events[0].uuid).count() == 0  # sanity

    def test_flush_with_empty_buffer(self):
        worker = self.get_worker()
        worker = CallbackBrokerWorker()
        worker.buff = {InventoryUpdateEvent: []}
        with mock.patch.object(InventoryUpdateEvent.objects, 'bulk_create') as flush_mock:
            worker.flush()

@@ -127,7 +105,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        # In postgres, text fields reject NUL character, 0x00
        # tests use sqlite3 which will not raise an error
        # but we can still test that it is sanitized before saving
        worker = self.get_worker()
        worker = CallbackBrokerWorker()
        kwargs = self.event_create_kwargs()
        events = [InventoryUpdateEvent(uuid=str(uuid4()), stdout="\x00", **kwargs)]
        assert "\x00" in events[0].stdout  # sanity
@@ -63,6 +63,33 @@ def swagger_autogen(requests=__SWAGGER_REQUESTS__):
    return requests


class FakeRedis:
    def keys(self, *args, **kwargs):
        return []

    def set(self):
        pass

    def get(self):
        return None

    @classmethod
    def from_url(cls, *args, **kwargs):
        return cls()

    def pipeline(self):
        return self

    def ping(self):
        return


@pytest.fixture
def fake_redis():
    with mock.patch('redis.Redis', new=FakeRedis):  # turn off redis stuff
        yield


@pytest.fixture
def user():
    def u(name, is_superuser=False):
@@ -3,6 +3,10 @@ import pytest

# AWX
from awx.main.ha import is_ha_environment
from awx.main.models.ha import Instance
from awx.main.dispatch.pool import get_auto_max_workers

# Django
from django.test.utils import override_settings


@pytest.mark.django_db

@@ -17,3 +21,25 @@ def test_db_localhost():
    Instance.objects.create(hostname='foo', node_type='hybrid')
    Instance.objects.create(hostname='bar', node_type='execution')
    assert is_ha_environment() is False


@pytest.mark.django_db
@pytest.mark.parametrize(
    'settings',
    [
        dict(SYSTEM_TASK_ABS_MEM='16Gi', SYSTEM_TASK_ABS_CPU='24', SYSTEM_TASK_FORKS_MEM=400, SYSTEM_TASK_FORKS_CPU=4),
        dict(SYSTEM_TASK_ABS_MEM='124Gi', SYSTEM_TASK_ABS_CPU='2', SYSTEM_TASK_FORKS_MEM=None, SYSTEM_TASK_FORKS_CPU=None),
    ],
    ids=['cpu_dominated', 'memory_dominated'],
)
def test_dispatcher_max_workers_reserve(settings, fake_redis):
    """This tests that the dispatcher max_workers matches instance capacity

    Assumes capacity_adjustment is 1,
    plus reserve worker count
    """
    with override_settings(**settings):
        i = Instance.objects.create(hostname='test-1', node_type='hybrid')
        i.local_health_check()

        assert get_auto_max_workers() == i.capacity + 7, (i.cpu, i.memory, i.cpu_capacity, i.mem_capacity)
awx/main/tests/live/tests/test_host_update_contention.py — new file, 79 lines
@@ -0,0 +1,79 @@
import multiprocessing
import random

from django.db import connection
from django.utils.timezone import now

from awx.main.models import Inventory, Host


def worker_delete_target(ready_event, continue_event, field_name):
    """Runs the bulk update, will be called in duplicate, in parallel"""
    inv = Inventory.objects.get(organization__name='Default', name='test_host_update_contention')
    host_list = list(inv.hosts.all())
    random.shuffle(host_list)
    for i, host in enumerate(host_list):
        setattr(host, field_name, f'my_var: {i}')

    # ready to do the bulk_update
    print('worker has loaded all the hosts needed')
    ready_event.set()
    # wait for the coordination message
    continue_event.wait()

    # # presumed fix
    # host_list = sorted(host_list, key=lambda host: host.id)

    # NOTE: did not reproduce the bug without batch_size
    Host.objects.bulk_update(host_list, [field_name], batch_size=100)
    print('finished doing the bulk update in worker')


def test_host_update_contention(default_org):
    inv_kwargs = dict(organization=default_org, name='test_host_update_contention')

    if Inventory.objects.filter(**inv_kwargs).exists():
        inv = Inventory.objects.get(**inv_kwargs).delete()

    inv = Inventory.objects.create(**inv_kwargs)
    right_now = now()
    hosts = [Host(inventory=inv, name=f'host-{i}', created=right_now, modified=right_now) for i in range(1000)]
    print('bulk creating hosts')
    Host.objects.bulk_create(hosts)

    # sanity check
    for host in hosts:
        assert not host.variables

    # Force our worker pool to make their own connection
    connection.close()

    ready_events = [multiprocessing.Event() for _ in range(2)]
    continue_event = multiprocessing.Event()

    print('spawning processes for concurrent bulk updates')
    processes = []
    fields = ['variables', 'ansible_facts']
    for i in range(2):
        p = multiprocessing.Process(target=worker_delete_target, args=(ready_events[i], continue_event, fields[i]))
        processes.append(p)
        p.start()

    # Assure both processes are connected and have loaded their host list
    for e in ready_events:
        print('waiting on subprocess ready event')
        e.wait()

    # Begin the bulk_update queries
    print('setting the continue event for the workers')
    continue_event.set()

    # if a deadlock happens it will probably be surfaced by result here
    print('waiting on the workers to finish the bulk_update')
    for p in processes:
        p.join()

    print('checking workers have variables set')
    for host in inv.hosts.all():
        assert host.variables.startswith('my_var:')
        assert host.ansible_facts.startswith('my_var:')
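The reproduction deliberately leaves each worker's host list shuffled; the commented-out "presumed fix" lines are the same ordering change applied to raw_update_hosts earlier in this diff. Enabled, that portion of the worker would read as follows (same statements, just uncommented).

```python
# The "presumed fix" from worker_delete_target, enabled: order by primary key so
# both concurrent workers acquire row locks in the same order before bulk_update.
host_list = sorted(host_list, key=lambda host: host.id)

# NOTE: did not reproduce the bug without batch_size
Host.objects.bulk_update(host_list, [field_name], batch_size=100)
```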
@@ -103,7 +103,7 @@ def test_finish_job_fact_cache_with_existing_data(hosts, mocker, tmpdir, ref_tim
        assert host.ansible_facts_modified == ref_time
    assert hosts[1].ansible_facts == ansible_facts_new
    assert hosts[1].ansible_facts_modified > ref_time
    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'])
    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'], batch_size=100)


def test_finish_job_fact_cache_with_bad_data(hosts, mocker, tmpdir):

@@ -139,4 +139,4 @@ def test_finish_job_fact_cache_clear(hosts, mocker, ref_time, tmpdir):
        assert host.ansible_facts_modified == ref_time
    assert hosts[1].ansible_facts == {}
    assert hosts[1].ansible_facts_modified > ref_time
    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'])
    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'], batch_size=100)
tools/scripts/firehose_tasks.py — new executable file, 16 lines
@@ -0,0 +1,16 @@
#!/usr/bin/env python

from django import setup

from awx import prepare_env

prepare_env()

setup()

# Keeping this in test folder allows it to be importable
from awx.main.tests.data.sleep_task import sleep_task


for i in range(634):
    sleep_task.delay()
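The script hard-codes 634 default sleep tasks. A hedged variant under the same environment assumptions as the script above (importable awx package, Django settings, and a running dispatcher); the argparse flag is an addition for illustration, not part of the PR.

```python
#!/usr/bin/env python
# Illustrative variant of firehose_tasks.py -- the --count flag is invented here.
import argparse

from django import setup

from awx import prepare_env

prepare_env()
setup()

from awx.main.tests.data.sleep_task import sleep_task  # noqa: E402

parser = argparse.ArgumentParser(description='Flood the dispatcher with sleep tasks')
parser.add_argument('--count', type=int, default=634)
args = parser.parse_args()

for _ in range(args.count):
    sleep_task.delay()
```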