mirror of
https://github.com/ansible/awx.git
synced 2026-03-03 09:48:51 -03:30
AAP-40782 Fix too-low max_workers value, dump running at capacity (#15873)
* Dump running tasks when running out of capacity * Use same logic for max_workers and capacity * Address case where CPU capacity is the constraint * Add a test for correspondence * Fake redis to make tests work
This commit is contained in:
@@ -7,6 +7,7 @@ import time
|
|||||||
import traceback
|
import traceback
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
import json
|
||||||
|
|
||||||
import collections
|
import collections
|
||||||
from multiprocessing import Process
|
from multiprocessing import Process
|
||||||
@@ -25,7 +26,10 @@ from ansible_base.lib.logging.runtime import log_excess_runtime
|
|||||||
|
|
||||||
from awx.main.models import UnifiedJob
|
from awx.main.models import UnifiedJob
|
||||||
from awx.main.dispatch import reaper
|
from awx.main.dispatch import reaper
|
||||||
from awx.main.utils.common import convert_mem_str_to_bytes, get_mem_effective_capacity
|
from awx.main.utils.common import get_mem_effective_capacity, get_corrected_memory, get_corrected_cpu, get_cpu_effective_capacity
|
||||||
|
|
||||||
|
# ansible-runner
|
||||||
|
from ansible_runner.utils.capacity import get_mem_in_bytes, get_cpu_count
|
||||||
|
|
||||||
if 'run_callback_receiver' in sys.argv:
|
if 'run_callback_receiver' in sys.argv:
|
||||||
logger = logging.getLogger('awx.main.commands.run_callback_receiver')
|
logger = logging.getLogger('awx.main.commands.run_callback_receiver')
|
||||||
@@ -307,6 +311,41 @@ class WorkerPool(object):
|
|||||||
logger.exception('could not kill {}'.format(worker.pid))
|
logger.exception('could not kill {}'.format(worker.pid))
|
||||||
|
|
||||||
|
|
||||||
|
def get_auto_max_workers():
|
||||||
|
"""Method we normally rely on to get max_workers
|
||||||
|
|
||||||
|
Uses almost same logic as Instance.local_health_check
|
||||||
|
The important thing is to be MORE than Instance.capacity
|
||||||
|
so that the task-manager does not over-schedule this node
|
||||||
|
|
||||||
|
Ideally we would just use the capacity from the database plus reserve workers,
|
||||||
|
but this poses some bootstrap problems where OCP task containers
|
||||||
|
register themselves after startup
|
||||||
|
"""
|
||||||
|
# Get memory from ansible-runner
|
||||||
|
total_memory_gb = get_mem_in_bytes()
|
||||||
|
|
||||||
|
# This may replace memory calculation with a user override
|
||||||
|
corrected_memory = get_corrected_memory(total_memory_gb)
|
||||||
|
|
||||||
|
# Get same number as max forks based on memory, this function takes memory as bytes
|
||||||
|
mem_capacity = get_mem_effective_capacity(corrected_memory, is_control_node=True)
|
||||||
|
|
||||||
|
# Follow same process for CPU capacity constraint
|
||||||
|
cpu_count = get_cpu_count()
|
||||||
|
corrected_cpu = get_corrected_cpu(cpu_count)
|
||||||
|
cpu_capacity = get_cpu_effective_capacity(corrected_cpu, is_control_node=True)
|
||||||
|
|
||||||
|
# Here is what is different from health checks,
|
||||||
|
auto_max = max(mem_capacity, cpu_capacity)
|
||||||
|
|
||||||
|
# add magic number of extra workers to ensure
|
||||||
|
# we have a few extra workers to run the heartbeat
|
||||||
|
auto_max += 7
|
||||||
|
|
||||||
|
return auto_max
|
||||||
|
|
||||||
|
|
||||||
class AutoscalePool(WorkerPool):
|
class AutoscalePool(WorkerPool):
|
||||||
"""
|
"""
|
||||||
An extended pool implementation that automatically scales workers up and
|
An extended pool implementation that automatically scales workers up and
|
||||||
@@ -320,19 +359,7 @@ class AutoscalePool(WorkerPool):
|
|||||||
super(AutoscalePool, self).__init__(*args, **kwargs)
|
super(AutoscalePool, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
if self.max_workers is None:
|
if self.max_workers is None:
|
||||||
settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None)
|
self.max_workers = get_auto_max_workers()
|
||||||
if settings_absmem is not None:
|
|
||||||
# There are 1073741824 bytes in a gigabyte. Convert bytes to gigabytes by dividing by 2**30
|
|
||||||
total_memory_gb = convert_mem_str_to_bytes(settings_absmem) // 2**30
|
|
||||||
else:
|
|
||||||
total_memory_gb = (psutil.virtual_memory().total >> 30) + 1 # noqa: round up
|
|
||||||
|
|
||||||
# Get same number as max forks based on memory, this function takes memory as bytes
|
|
||||||
self.max_workers = get_mem_effective_capacity(total_memory_gb * 2**30)
|
|
||||||
|
|
||||||
# add magic prime number of extra workers to ensure
|
|
||||||
# we have a few extra workers to run the heartbeat
|
|
||||||
self.max_workers += 7
|
|
||||||
|
|
||||||
# max workers can't be less than min_workers
|
# max workers can't be less than min_workers
|
||||||
self.max_workers = max(self.min_workers, self.max_workers)
|
self.max_workers = max(self.min_workers, self.max_workers)
|
||||||
@@ -346,6 +373,9 @@ class AutoscalePool(WorkerPool):
|
|||||||
self.scale_up_ct = 0
|
self.scale_up_ct = 0
|
||||||
self.worker_count_max = 0
|
self.worker_count_max = 0
|
||||||
|
|
||||||
|
# last time we wrote current tasks, to avoid too much log spam
|
||||||
|
self.last_task_list_log = time.monotonic()
|
||||||
|
|
||||||
def produce_subsystem_metrics(self, metrics_object):
|
def produce_subsystem_metrics(self, metrics_object):
|
||||||
metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct)
|
metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct)
|
||||||
metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers))
|
metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers))
|
||||||
@@ -463,6 +493,14 @@ class AutoscalePool(WorkerPool):
|
|||||||
self.worker_count_max = new_worker_ct
|
self.worker_count_max = new_worker_ct
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def fast_task_serialization(current_task):
|
||||||
|
try:
|
||||||
|
return str(current_task.get('task')) + ' - ' + str(sorted(current_task.get('args', []))) + ' - ' + str(sorted(current_task.get('kwargs', {})))
|
||||||
|
except Exception:
|
||||||
|
# just make sure this does not make things worse
|
||||||
|
return str(current_task)
|
||||||
|
|
||||||
def write(self, preferred_queue, body):
|
def write(self, preferred_queue, body):
|
||||||
if 'guid' in body:
|
if 'guid' in body:
|
||||||
set_guid(body['guid'])
|
set_guid(body['guid'])
|
||||||
@@ -484,6 +522,15 @@ class AutoscalePool(WorkerPool):
|
|||||||
if isinstance(body, dict):
|
if isinstance(body, dict):
|
||||||
task_name = body.get('task')
|
task_name = body.get('task')
|
||||||
logger.warning(f'Workers maxed, queuing {task_name}, load: {sum(len(w.managed_tasks) for w in self.workers)} / {len(self.workers)}')
|
logger.warning(f'Workers maxed, queuing {task_name}, load: {sum(len(w.managed_tasks) for w in self.workers)} / {len(self.workers)}')
|
||||||
|
# Once every 10 seconds write out task list for debugging
|
||||||
|
if time.monotonic() - self.last_task_list_log >= 10.0:
|
||||||
|
task_counts = {}
|
||||||
|
for worker in self.workers:
|
||||||
|
task_slug = self.fast_task_serialization(worker.current_task)
|
||||||
|
task_counts.setdefault(task_slug, 0)
|
||||||
|
task_counts[task_slug] += 1
|
||||||
|
logger.info(f'Running tasks by count:\n{json.dumps(task_counts, indent=2)}')
|
||||||
|
self.last_task_list_log = time.monotonic()
|
||||||
return super(AutoscalePool, self).write(preferred_queue, body)
|
return super(AutoscalePool, self).write(preferred_queue, body)
|
||||||
except Exception:
|
except Exception:
|
||||||
for conn in connections.all():
|
for conn in connections.all():
|
||||||
|
|||||||
@@ -238,7 +238,7 @@ class AWXConsumerPG(AWXConsumerBase):
|
|||||||
def run(self, *args, **kwargs):
|
def run(self, *args, **kwargs):
|
||||||
super(AWXConsumerPG, self).run(*args, **kwargs)
|
super(AWXConsumerPG, self).run(*args, **kwargs)
|
||||||
|
|
||||||
logger.info(f"Running worker {self.name} listening to queues {self.queues}")
|
logger.info(f"Running {self.name}, workers min={self.pool.min_workers} max={self.pool.max_workers}, listening to queues {self.queues}")
|
||||||
init = False
|
init = False
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|||||||
17
awx/main/tests/data/sleep_task.py
Normal file
17
awx/main/tests/data/sleep_task.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
import time
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from awx.main.dispatch import get_task_queuename
|
||||||
|
from awx.main.dispatch.publish import task
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@task(queue=get_task_queuename)
|
||||||
|
def sleep_task(seconds=10, log=False):
|
||||||
|
if log:
|
||||||
|
logger.info('starting sleep_task')
|
||||||
|
time.sleep(seconds)
|
||||||
|
if log:
|
||||||
|
logger.info('finished sleep_task')
|
||||||
@@ -34,40 +34,18 @@ def test_wrapup_does_send_notifications(mocker):
|
|||||||
mock.assert_called_once_with('succeeded')
|
mock.assert_called_once_with('succeeded')
|
||||||
|
|
||||||
|
|
||||||
class FakeRedis:
|
|
||||||
def keys(self, *args, **kwargs):
|
|
||||||
return []
|
|
||||||
|
|
||||||
def set(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def get(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_url(cls, *args, **kwargs):
|
|
||||||
return cls()
|
|
||||||
|
|
||||||
def pipeline(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
class TestCallbackBrokerWorker(TransactionTestCase):
|
class TestCallbackBrokerWorker(TransactionTestCase):
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def turn_off_websockets(self):
|
def turn_off_websockets_and_redis(self, fake_redis):
|
||||||
with mock.patch('awx.main.dispatch.worker.callback.emit_event_detail', lambda *a, **kw: None):
|
with mock.patch('awx.main.dispatch.worker.callback.emit_event_detail', lambda *a, **kw: None):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
def get_worker(self):
|
|
||||||
with mock.patch('redis.Redis', new=FakeRedis): # turn off redis stuff
|
|
||||||
return CallbackBrokerWorker()
|
|
||||||
|
|
||||||
def event_create_kwargs(self):
|
def event_create_kwargs(self):
|
||||||
inventory_update = InventoryUpdate.objects.create(source='file', inventory_source=InventorySource.objects.create(source='file'))
|
inventory_update = InventoryUpdate.objects.create(source='file', inventory_source=InventorySource.objects.create(source='file'))
|
||||||
return dict(inventory_update=inventory_update, created=inventory_update.created)
|
return dict(inventory_update=inventory_update, created=inventory_update.created)
|
||||||
|
|
||||||
def test_flush_with_valid_event(self):
|
def test_flush_with_valid_event(self):
|
||||||
worker = self.get_worker()
|
worker = CallbackBrokerWorker()
|
||||||
events = [InventoryUpdateEvent(uuid=str(uuid4()), **self.event_create_kwargs())]
|
events = [InventoryUpdateEvent(uuid=str(uuid4()), **self.event_create_kwargs())]
|
||||||
worker.buff = {InventoryUpdateEvent: events}
|
worker.buff = {InventoryUpdateEvent: events}
|
||||||
worker.flush()
|
worker.flush()
|
||||||
@@ -75,7 +53,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
|
|||||||
assert InventoryUpdateEvent.objects.filter(uuid=events[0].uuid).count() == 1
|
assert InventoryUpdateEvent.objects.filter(uuid=events[0].uuid).count() == 1
|
||||||
|
|
||||||
def test_flush_with_invalid_event(self):
|
def test_flush_with_invalid_event(self):
|
||||||
worker = self.get_worker()
|
worker = CallbackBrokerWorker()
|
||||||
kwargs = self.event_create_kwargs()
|
kwargs = self.event_create_kwargs()
|
||||||
events = [
|
events = [
|
||||||
InventoryUpdateEvent(uuid=str(uuid4()), stdout='good1', **kwargs),
|
InventoryUpdateEvent(uuid=str(uuid4()), stdout='good1', **kwargs),
|
||||||
@@ -90,7 +68,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
|
|||||||
assert worker.buff == {InventoryUpdateEvent: [events[1]]}
|
assert worker.buff == {InventoryUpdateEvent: [events[1]]}
|
||||||
|
|
||||||
def test_duplicate_key_not_saved_twice(self):
|
def test_duplicate_key_not_saved_twice(self):
|
||||||
worker = self.get_worker()
|
worker = CallbackBrokerWorker()
|
||||||
events = [InventoryUpdateEvent(uuid=str(uuid4()), **self.event_create_kwargs())]
|
events = [InventoryUpdateEvent(uuid=str(uuid4()), **self.event_create_kwargs())]
|
||||||
worker.buff = {InventoryUpdateEvent: events.copy()}
|
worker.buff = {InventoryUpdateEvent: events.copy()}
|
||||||
worker.flush()
|
worker.flush()
|
||||||
@@ -104,7 +82,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
|
|||||||
assert worker.buff.get(InventoryUpdateEvent, []) == []
|
assert worker.buff.get(InventoryUpdateEvent, []) == []
|
||||||
|
|
||||||
def test_give_up_on_bad_event(self):
|
def test_give_up_on_bad_event(self):
|
||||||
worker = self.get_worker()
|
worker = CallbackBrokerWorker()
|
||||||
events = [InventoryUpdateEvent(uuid=str(uuid4()), counter=-2, **self.event_create_kwargs())]
|
events = [InventoryUpdateEvent(uuid=str(uuid4()), counter=-2, **self.event_create_kwargs())]
|
||||||
worker.buff = {InventoryUpdateEvent: events.copy()}
|
worker.buff = {InventoryUpdateEvent: events.copy()}
|
||||||
|
|
||||||
@@ -117,7 +95,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
|
|||||||
assert InventoryUpdateEvent.objects.filter(uuid=events[0].uuid).count() == 0 # sanity
|
assert InventoryUpdateEvent.objects.filter(uuid=events[0].uuid).count() == 0 # sanity
|
||||||
|
|
||||||
def test_flush_with_empty_buffer(self):
|
def test_flush_with_empty_buffer(self):
|
||||||
worker = self.get_worker()
|
worker = CallbackBrokerWorker()
|
||||||
worker.buff = {InventoryUpdateEvent: []}
|
worker.buff = {InventoryUpdateEvent: []}
|
||||||
with mock.patch.object(InventoryUpdateEvent.objects, 'bulk_create') as flush_mock:
|
with mock.patch.object(InventoryUpdateEvent.objects, 'bulk_create') as flush_mock:
|
||||||
worker.flush()
|
worker.flush()
|
||||||
@@ -127,7 +105,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
|
|||||||
# In postgres, text fields reject NUL character, 0x00
|
# In postgres, text fields reject NUL character, 0x00
|
||||||
# tests use sqlite3 which will not raise an error
|
# tests use sqlite3 which will not raise an error
|
||||||
# but we can still test that it is sanitized before saving
|
# but we can still test that it is sanitized before saving
|
||||||
worker = self.get_worker()
|
worker = CallbackBrokerWorker()
|
||||||
kwargs = self.event_create_kwargs()
|
kwargs = self.event_create_kwargs()
|
||||||
events = [InventoryUpdateEvent(uuid=str(uuid4()), stdout="\x00", **kwargs)]
|
events = [InventoryUpdateEvent(uuid=str(uuid4()), stdout="\x00", **kwargs)]
|
||||||
assert "\x00" in events[0].stdout # sanity
|
assert "\x00" in events[0].stdout # sanity
|
||||||
|
|||||||
@@ -63,6 +63,33 @@ def swagger_autogen(requests=__SWAGGER_REQUESTS__):
|
|||||||
return requests
|
return requests
|
||||||
|
|
||||||
|
|
||||||
|
class FakeRedis:
|
||||||
|
def keys(self, *args, **kwargs):
|
||||||
|
return []
|
||||||
|
|
||||||
|
def set(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get(self):
|
||||||
|
return None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_url(cls, *args, **kwargs):
|
||||||
|
return cls()
|
||||||
|
|
||||||
|
def pipeline(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def ping(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def fake_redis():
|
||||||
|
with mock.patch('redis.Redis', new=FakeRedis): # turn off redis stuff
|
||||||
|
yield
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def user():
|
def user():
|
||||||
def u(name, is_superuser=False):
|
def u(name, is_superuser=False):
|
||||||
|
|||||||
@@ -3,6 +3,10 @@ import pytest
|
|||||||
# AWX
|
# AWX
|
||||||
from awx.main.ha import is_ha_environment
|
from awx.main.ha import is_ha_environment
|
||||||
from awx.main.models.ha import Instance
|
from awx.main.models.ha import Instance
|
||||||
|
from awx.main.dispatch.pool import get_auto_max_workers
|
||||||
|
|
||||||
|
# Django
|
||||||
|
from django.test.utils import override_settings
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
@@ -17,3 +21,25 @@ def test_db_localhost():
|
|||||||
Instance.objects.create(hostname='foo', node_type='hybrid')
|
Instance.objects.create(hostname='foo', node_type='hybrid')
|
||||||
Instance.objects.create(hostname='bar', node_type='execution')
|
Instance.objects.create(hostname='bar', node_type='execution')
|
||||||
assert is_ha_environment() is False
|
assert is_ha_environment() is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'settings',
|
||||||
|
[
|
||||||
|
dict(SYSTEM_TASK_ABS_MEM='16Gi', SYSTEM_TASK_ABS_CPU='24', SYSTEM_TASK_FORKS_MEM=400, SYSTEM_TASK_FORKS_CPU=4),
|
||||||
|
dict(SYSTEM_TASK_ABS_MEM='124Gi', SYSTEM_TASK_ABS_CPU='2', SYSTEM_TASK_FORKS_MEM=None, SYSTEM_TASK_FORKS_CPU=None),
|
||||||
|
],
|
||||||
|
ids=['cpu_dominated', 'memory_dominated'],
|
||||||
|
)
|
||||||
|
def test_dispatcher_max_workers_reserve(settings, fake_redis):
|
||||||
|
"""This tests that the dispatcher max_workers matches instance capacity
|
||||||
|
|
||||||
|
Assumes capacity_adjustment is 1,
|
||||||
|
plus reserve worker count
|
||||||
|
"""
|
||||||
|
with override_settings(**settings):
|
||||||
|
i = Instance.objects.create(hostname='test-1', node_type='hybrid')
|
||||||
|
i.local_health_check()
|
||||||
|
|
||||||
|
assert get_auto_max_workers() == i.capacity + 7, (i.cpu, i.memory, i.cpu_capacity, i.mem_capacity)
|
||||||
|
|||||||
16
tools/scripts/firehose_tasks.py
Executable file
16
tools/scripts/firehose_tasks.py
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
from django import setup
|
||||||
|
|
||||||
|
from awx import prepare_env
|
||||||
|
|
||||||
|
prepare_env()
|
||||||
|
|
||||||
|
setup()
|
||||||
|
|
||||||
|
# Keeping this in test folder allows it to be importable
|
||||||
|
from awx.main.tests.data.sleep_task import sleep_task
|
||||||
|
|
||||||
|
|
||||||
|
for i in range(634):
|
||||||
|
sleep_task.delay()
|
||||||
Reference in New Issue
Block a user