Update tests to expect batch_size to agree with changes

Remove del load_credentials to resolve CI issue
Sort both bulk updates and add batch size to facts bulk update to resolve deadlock issue
2026-02-05 03:24:50 -03:30 · 2025-04-18 16:08:18 -04:00 · 2025-04-18 15:22:35 -04:00 · 2025-04-18 15:19:16 -04:00 · 2025-04-18 15:16:41 -04:00 · 2025-04-18 15:16:08 -04:00
12 changed files with 241 additions and 48 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -150,6 +150,8 @@ use_dev_supervisor.txt

 awx/ui/src
 awx/ui/build
+awx/ui/.ui-built
+awx/ui_next

 # Docs build stuff
 docs/docsite/build/
--- a/awx/main/dispatch/pool.py
+++ b/awx/main/dispatch/pool.py
@@ -7,6 +7,7 @@ import time
 import traceback
 from datetime import datetime
 from uuid import uuid4
+import json

 import collections
 from multiprocessing import Process
@@ -25,7 +26,10 @@ from ansible_base.lib.logging.runtime import log_excess_runtime

 from awx.main.models import UnifiedJob
 from awx.main.dispatch import reaper
-from awx.main.utils.common import convert_mem_str_to_bytes, get_mem_effective_capacity
+from awx.main.utils.common import get_mem_effective_capacity, get_corrected_memory, get_corrected_cpu, get_cpu_effective_capacity
+
+# ansible-runner
+from ansible_runner.utils.capacity import get_mem_in_bytes, get_cpu_count

 if 'run_callback_receiver' in sys.argv:
    logger = logging.getLogger('awx.main.commands.run_callback_receiver')
@@ -307,6 +311,41 @@ class WorkerPool(object):
            logger.exception('could not kill {}'.format(worker.pid))


+def get_auto_max_workers():
+    """Method we normally rely on to get max_workers
+
+    Uses almost same logic as Instance.local_health_check
+    The important thing is to be MORE than Instance.capacity
+    so that the task-manager does not over-schedule this node
+
+    Ideally we would just use the capacity from the database plus reserve workers,
+    but this poses some bootstrap problems where OCP task containers
+    register themselves after startup
+    """
+    # Get memory from ansible-runner
+    total_memory_gb = get_mem_in_bytes()
+
+    # This may replace memory calculation with a user override
+    corrected_memory = get_corrected_memory(total_memory_gb)
+
+    # Get same number as max forks based on memory, this function takes memory as bytes
+    mem_capacity = get_mem_effective_capacity(corrected_memory, is_control_node=True)
+
+    # Follow same process for CPU capacity constraint
+    cpu_count = get_cpu_count()
+    corrected_cpu = get_corrected_cpu(cpu_count)
+    cpu_capacity = get_cpu_effective_capacity(corrected_cpu, is_control_node=True)
+
+    # Here is what is different from health checks,
+    auto_max = max(mem_capacity, cpu_capacity)
+
+    # add magic number of extra workers to ensure
+    # we have a few extra workers to run the heartbeat
+    auto_max += 7
+
+    return auto_max
+
+
 class AutoscalePool(WorkerPool):
    """
    An extended pool implementation that automatically scales workers up and
@@ -320,19 +359,7 @@ class AutoscalePool(WorkerPool):
        super(AutoscalePool, self).__init__(*args, **kwargs)

        if self.max_workers is None:
-            settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None)
-            if settings_absmem is not None:
-                # There are 1073741824 bytes in a gigabyte. Convert bytes to gigabytes by dividing by 2**30
-                total_memory_gb = convert_mem_str_to_bytes(settings_absmem) // 2**30
-            else:
-                total_memory_gb = (psutil.virtual_memory().total >> 30) + 1  # noqa: round up
-
-            # Get same number as max forks based on memory, this function takes memory as bytes
-            self.max_workers = get_mem_effective_capacity(total_memory_gb * 2**30)
-
-            # add magic prime number of extra workers to ensure
-            # we have a few extra workers to run the heartbeat
-            self.max_workers += 7
+            self.max_workers = get_auto_max_workers()

        # max workers can't be less than min_workers
        self.max_workers = max(self.min_workers, self.max_workers)
@@ -346,6 +373,9 @@ class AutoscalePool(WorkerPool):
        self.scale_up_ct = 0
        self.worker_count_max = 0

+        # last time we wrote current tasks, to avoid too much log spam
+        self.last_task_list_log = time.monotonic()
+
    def produce_subsystem_metrics(self, metrics_object):
        metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct)
        metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers))
@@ -463,6 +493,14 @@ class AutoscalePool(WorkerPool):
                self.worker_count_max = new_worker_ct
            return ret

+    @staticmethod
+    def fast_task_serialization(current_task):
+        try:
+            return str(current_task.get('task')) + ' - ' + str(sorted(current_task.get('args', []))) + ' - ' + str(sorted(current_task.get('kwargs', {})))
+        except Exception:
+            # just make sure this does not make things worse
+            return str(current_task)
+
    def write(self, preferred_queue, body):
        if 'guid' in body:
            set_guid(body['guid'])
@@ -484,6 +522,15 @@ class AutoscalePool(WorkerPool):
                if isinstance(body, dict):
                    task_name = body.get('task')
                logger.warning(f'Workers maxed, queuing {task_name}, load: {sum(len(w.managed_tasks) for w in self.workers)} / {len(self.workers)}')
+                # Once every 10 seconds write out task list for debugging
+                if time.monotonic() - self.last_task_list_log >= 10.0:
+                    task_counts = {}
+                    for worker in self.workers:
+                        task_slug = self.fast_task_serialization(worker.current_task)
+                        task_counts.setdefault(task_slug, 0)
+                        task_counts[task_slug] += 1
+                    logger.info(f'Running tasks by count:\n{json.dumps(task_counts, indent=2)}')
+                    self.last_task_list_log = time.monotonic()
                return super(AutoscalePool, self).write(preferred_queue, body)
        except Exception:
            for conn in connections.all():
--- a/awx/main/dispatch/worker/base.py
+++ b/awx/main/dispatch/worker/base.py
@@ -238,7 +238,7 @@ class AWXConsumerPG(AWXConsumerBase):
    def run(self, *args, **kwargs):
        super(AWXConsumerPG, self).run(*args, **kwargs)

-        logger.info(f"Running worker {self.name} listening to queues {self.queues}")
+        logger.info(f"Running {self.name}, workers min={self.pool.min_workers} max={self.pool.max_workers}, listening to queues {self.queues}")
        init = False

        while True:
--- a/awx/main/models/events.py
+++ b/awx/main/models/events.py
@@ -602,7 +602,7 @@ class JobEvent(BasePlaybookEvent):
                    h.last_job_host_summary_id = host_mapping[h.id]
                    updated_hosts.add(h)

-            Host.objects.bulk_update(list(updated_hosts), ['last_job_id', 'last_job_host_summary_id'], batch_size=100)
+            Host.objects.bulk_update(sorted(updated_hosts, key=lambda host: host.id), ['last_job_id', 'last_job_host_summary_id'], batch_size=100)

            # Create/update Host Metrics
            self._update_host_metrics(updated_hosts_list)
--- a/awx/main/tasks/facts.py
+++ b/awx/main/tasks/facts.py
@@ -62,7 +62,8 @@ def start_fact_cache(hosts, destination, log_data, timeout=None, inventory_id=No


 def raw_update_hosts(host_list):
-    Host.objects.bulk_update(host_list, ['ansible_facts', 'ansible_facts_modified'])
+    host_list = sorted(host_list, key=lambda host: host.id)
+    Host.objects.bulk_update(host_list, ['ansible_facts', 'ansible_facts_modified'], batch_size=100)


 def update_hosts(host_list, max_tries=5):
--- a/awx/main/tests/data/sleep_task.py
+++ b/awx/main/tests/data/sleep_task.py
@@ -0,0 +1,17 @@
+import time
+import logging
+
+from awx.main.dispatch import get_task_queuename
+from awx.main.dispatch.publish import task
+
+
+logger = logging.getLogger(__name__)
+
+
+@task(queue=get_task_queuename)
+def sleep_task(seconds=10, log=False):
+    if log:
+        logger.info('starting sleep_task')
+    time.sleep(seconds)
+    if log:
+        logger.info('finished sleep_task')
--- a/awx/main/tests/functional/commands/test_callback_receiver.py
+++ b/awx/main/tests/functional/commands/test_callback_receiver.py
@@ -34,40 +34,18 @@ def test_wrapup_does_send_notifications(mocker):
    mock.assert_called_once_with('succeeded')


-class FakeRedis:
-    def keys(self, *args, **kwargs):
-        return []
-
-    def set(self):
-        pass
-
-    def get(self):
-        return None
-
-    @classmethod
-    def from_url(cls, *args, **kwargs):
-        return cls()
-
-    def pipeline(self):
-        return self
-
-
 class TestCallbackBrokerWorker(TransactionTestCase):
    @pytest.fixture(autouse=True)
-    def turn_off_websockets(self):
+    def turn_off_websockets_and_redis(self, fake_redis):
        with mock.patch('awx.main.dispatch.worker.callback.emit_event_detail', lambda *a, **kw: None):
            yield

-    def get_worker(self):
-        with mock.patch('redis.Redis', new=FakeRedis):  # turn off redis stuff
-            return CallbackBrokerWorker()
-
    def event_create_kwargs(self):
        inventory_update = InventoryUpdate.objects.create(source='file', inventory_source=InventorySource.objects.create(source='file'))
        return dict(inventory_update=inventory_update, created=inventory_update.created)

    def test_flush_with_valid_event(self):
-        worker = self.get_worker()
+        worker = CallbackBrokerWorker()
        events = [InventoryUpdateEvent(uuid=str(uuid4()), **self.event_create_kwargs())]
        worker.buff = {InventoryUpdateEvent: events}
        worker.flush()
@@ -75,7 +53,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        assert InventoryUpdateEvent.objects.filter(uuid=events[0].uuid).count() == 1

    def test_flush_with_invalid_event(self):
-        worker = self.get_worker()
+        worker = CallbackBrokerWorker()
        kwargs = self.event_create_kwargs()
        events = [
            InventoryUpdateEvent(uuid=str(uuid4()), stdout='good1', **kwargs),
@@ -90,7 +68,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        assert worker.buff == {InventoryUpdateEvent: [events[1]]}

    def test_duplicate_key_not_saved_twice(self):
-        worker = self.get_worker()
+        worker = CallbackBrokerWorker()
        events = [InventoryUpdateEvent(uuid=str(uuid4()), **self.event_create_kwargs())]
        worker.buff = {InventoryUpdateEvent: events.copy()}
        worker.flush()
@@ -104,7 +82,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        assert worker.buff.get(InventoryUpdateEvent, []) == []

    def test_give_up_on_bad_event(self):
-        worker = self.get_worker()
+        worker = CallbackBrokerWorker()
        events = [InventoryUpdateEvent(uuid=str(uuid4()), counter=-2, **self.event_create_kwargs())]
        worker.buff = {InventoryUpdateEvent: events.copy()}

@@ -117,7 +95,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        assert InventoryUpdateEvent.objects.filter(uuid=events[0].uuid).count() == 0  # sanity

    def test_flush_with_empty_buffer(self):
-        worker = self.get_worker()
+        worker = CallbackBrokerWorker()
        worker.buff = {InventoryUpdateEvent: []}
        with mock.patch.object(InventoryUpdateEvent.objects, 'bulk_create') as flush_mock:
            worker.flush()
@@ -127,7 +105,7 @@ class TestCallbackBrokerWorker(TransactionTestCase):
        # In postgres, text fields reject NUL character, 0x00
        # tests use sqlite3 which will not raise an error
        # but we can still test that it is sanitized before saving
-        worker = self.get_worker()
+        worker = CallbackBrokerWorker()
        kwargs = self.event_create_kwargs()
        events = [InventoryUpdateEvent(uuid=str(uuid4()), stdout="\x00", **kwargs)]
        assert "\x00" in events[0].stdout  # sanity
--- a/awx/main/tests/functional/conftest.py
+++ b/awx/main/tests/functional/conftest.py
@@ -63,6 +63,33 @@ def swagger_autogen(requests=__SWAGGER_REQUESTS__):
    return requests


+class FakeRedis:
+    def keys(self, *args, **kwargs):
+        return []
+
+    def set(self):
+        pass
+
+    def get(self):
+        return None
+
+    @classmethod
+    def from_url(cls, *args, **kwargs):
+        return cls()
+
+    def pipeline(self):
+        return self
+
+    def ping(self):
+        return
+
+
+@pytest.fixture
+def fake_redis():
+    with mock.patch('redis.Redis', new=FakeRedis):  # turn off redis stuff
+        yield
+
+
@pytest.fixture
 def user():
    def u(name, is_superuser=False):
--- a/awx/main/tests/functional/models/test_ha.py
+++ b/awx/main/tests/functional/models/test_ha.py
@@ -3,6 +3,10 @@ import pytest
 # AWX
 from awx.main.ha import is_ha_environment
 from awx.main.models.ha import Instance
+from awx.main.dispatch.pool import get_auto_max_workers
+
+# Django
+from django.test.utils import override_settings


@pytest.mark.django_db
@@ -17,3 +21,25 @@ def test_db_localhost():
    Instance.objects.create(hostname='foo', node_type='hybrid')
    Instance.objects.create(hostname='bar', node_type='execution')
    assert is_ha_environment() is False
+
+
+@pytest.mark.django_db
+@pytest.mark.parametrize(
+    'settings',
+    [
+        dict(SYSTEM_TASK_ABS_MEM='16Gi', SYSTEM_TASK_ABS_CPU='24', SYSTEM_TASK_FORKS_MEM=400, SYSTEM_TASK_FORKS_CPU=4),
+        dict(SYSTEM_TASK_ABS_MEM='124Gi', SYSTEM_TASK_ABS_CPU='2', SYSTEM_TASK_FORKS_MEM=None, SYSTEM_TASK_FORKS_CPU=None),
+    ],
+    ids=['cpu_dominated', 'memory_dominated'],
+)
+def test_dispatcher_max_workers_reserve(settings, fake_redis):
+    """This tests that the dispatcher max_workers matches instance capacity
+
+    Assumes capacity_adjustment is 1,
+    plus reserve worker count
+    """
+    with override_settings(**settings):
+        i = Instance.objects.create(hostname='test-1', node_type='hybrid')
+        i.local_health_check()
+
+        assert get_auto_max_workers() == i.capacity + 7, (i.cpu, i.memory, i.cpu_capacity, i.mem_capacity)
--- a/awx/main/tests/live/tests/test_host_update_contention.py
+++ b/awx/main/tests/live/tests/test_host_update_contention.py
@@ -0,0 +1,79 @@
+import multiprocessing
+import random
+
+from django.db import connection
+from django.utils.timezone import now
+
+from awx.main.models import Inventory, Host
+
+
+def worker_delete_target(ready_event, continue_event, field_name):
+    """Runs the bulk update, will be called in duplicate, in parallel"""
+    inv = Inventory.objects.get(organization__name='Default', name='test_host_update_contention')
+    host_list = list(inv.hosts.all())
+    random.shuffle(host_list)
+    for i, host in enumerate(host_list):
+        setattr(host, field_name, f'my_var: {i}')
+
+    # ready to do the bulk_update
+    print('worker has loaded all the hosts needed')
+    ready_event.set()
+    # wait for the coordination message
+    continue_event.wait()
+
+    # # presumed fix
+    # host_list = sorted(host_list, key=lambda host: host.id)
+
+    # NOTE: did not reproduce the bug without batch_size
+    Host.objects.bulk_update(host_list, [field_name], batch_size=100)
+    print('finished doing the bulk update in worker')
+
+
+def test_host_update_contention(default_org):
+    inv_kwargs = dict(organization=default_org, name='test_host_update_contention')
+
+    if Inventory.objects.filter(**inv_kwargs).exists():
+        inv = Inventory.objects.get(**inv_kwargs).delete()
+
+    inv = Inventory.objects.create(**inv_kwargs)
+    right_now = now()
+    hosts = [Host(inventory=inv, name=f'host-{i}', created=right_now, modified=right_now) for i in range(1000)]
+    print('bulk creating hosts')
+    Host.objects.bulk_create(hosts)
+
+    # sanity check
+    for host in hosts:
+        assert not host.variables
+
+    # Force our worker pool to make their own connection
+    connection.close()
+
+    ready_events = [multiprocessing.Event() for _ in range(2)]
+    continue_event = multiprocessing.Event()
+
+    print('spawning processes for concurrent bulk updates')
+    processes = []
+    fields = ['variables', 'ansible_facts']
+    for i in range(2):
+        p = multiprocessing.Process(target=worker_delete_target, args=(ready_events[i], continue_event, fields[i]))
+        processes.append(p)
+        p.start()
+
+    # Assure both processes are connected and have loaded their host list
+    for e in ready_events:
+        print('waiting on subprocess ready event')
+        e.wait()
+
+    # Begin the bulk_update queries
+    print('setting the continue event for the workers')
+    continue_event.set()
+
+    # if a Deadloack happens it will probably be surfaced by result here
+    print('waiting on the workers to finish the bulk_update')
+    for p in processes:
+        p.join()
+
+    print('checking workers have variables set')
+    for host in inv.hosts.all():
+        assert host.variables.startswith('my_var:')
+        assert host.ansible_facts.startswith('my_var:')
--- a/awx/main/tests/unit/models/test_jobs.py
+++ b/awx/main/tests/unit/models/test_jobs.py
@@ -103,7 +103,7 @@ def test_finish_job_fact_cache_with_existing_data(hosts, mocker, tmpdir, ref_tim
        assert host.ansible_facts_modified == ref_time
    assert hosts[1].ansible_facts == ansible_facts_new
    assert hosts[1].ansible_facts_modified > ref_time
-    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'])
+    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'], batch_size=100)


 def test_finish_job_fact_cache_with_bad_data(hosts, mocker, tmpdir):
@@ -139,4 +139,4 @@ def test_finish_job_fact_cache_clear(hosts, mocker, ref_time, tmpdir):
        assert host.ansible_facts_modified == ref_time
    assert hosts[1].ansible_facts == {}
    assert hosts[1].ansible_facts_modified > ref_time
-    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'])
+    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'], batch_size=100)
--- a/tools/scripts/firehose_tasks.py
+++ b/tools/scripts/firehose_tasks.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+
+from django import setup
+
+from awx import prepare_env
+
+prepare_env()
+
+setup()
+
+# Keeping this in test folder allows it to be importable
+from awx.main.tests.data.sleep_task import sleep_task
+
+
+for i in range(634):
+    sleep_task.delay()
Author	SHA1	Message	Date
Lila	98dc60f9d6	Update tests to expect batch_size to agree with changes	2025-04-18 16:08:18 -04:00
Lila	29addb6ad0	Remove del load_credentials to resolve CI issue	2025-04-18 15:22:35 -04:00
Lila	c2f8acebb1	Sort both bulk updates and add batch size to facts bulk update to resolve deadlock issue	2025-04-18 15:19:16 -04:00
Alan Rominger	65d309f44a	Comment out actual fix	2025-04-18 15:16:41 -04:00
Alan Rominger	8bcc65fe80	Demo of sorting hosts	2025-04-18 15:16:08 -04:00
Alan Rominger	db6e8b9bad	AAP-40782 Fix too-low max_workers value, dump running at capacity (#15873 ) * Dump running tasks when running out of capacity * Use same logic for max_workers and capacity * Address case where CPU capacity is the constraint * Add a test for correspondence * Fake redis to make tests work	2025-04-16 16:43:21 -04:00
Hao Liu	483417762f	Git ignore legacy UI files (#15946 )	2025-04-16 14:40:12 -04:00