From 2c7184f9d28dd2093299bf53919277da347e0145 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Fri, 11 Aug 2023 11:13:56 -0400 Subject: [PATCH] Add a retry to update host facts on deadlocks (#14325) --- awx/main/tasks/facts.py | 29 ++++++++++++--- awx/main/tests/functional/test_jobs.py | 49 ++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 4 deletions(-) diff --git a/awx/main/tasks/facts.py b/awx/main/tasks/facts.py index 3db5f13091..a47f3890c2 100644 --- a/awx/main/tasks/facts.py +++ b/awx/main/tasks/facts.py @@ -9,6 +9,7 @@ from django.conf import settings from django.db.models.query import QuerySet from django.utils.encoding import smart_str from django.utils.timezone import now +from django.db import OperationalError # AWX from awx.main.utils.common import log_excess_runtime @@ -57,6 +58,28 @@ def start_fact_cache(hosts, destination, log_data, timeout=None, inventory_id=No return None +def raw_update_hosts(host_list): + Host.objects.bulk_update(host_list, ['ansible_facts', 'ansible_facts_modified']) + + +def update_hosts(host_list, max_tries=5): + if not host_list: + return + for i in range(max_tries): + try: + raw_update_hosts(host_list) + except OperationalError as exc: + # Deadlocks can happen if this runs at the same time as another large query + # inventory updates and updating last_job_host_summary are candidates for conflict + # but these would resolve easily on a retry + if i + 1 < max_tries: + logger.info(f'OperationalError (suspected deadlock) saving host facts retry {i}, message: {exc}') + continue + else: + raise + break + + @log_excess_runtime( logger, debug_cutoff=0.01, @@ -111,7 +134,5 @@ def finish_fact_cache(hosts, destination, facts_write_time, log_data, job_id=Non system_tracking_logger.info('Facts cleared for inventory {} host {}'.format(smart_str(host.inventory.name), smart_str(host.name))) log_data['cleared_ct'] += 1 if len(hosts_to_update) > 100: - Host.objects.bulk_update(hosts_to_update, ['ansible_facts', 'ansible_facts_modified']) - hosts_to_update = [] - if hosts_to_update: - Host.objects.bulk_update(hosts_to_update, ['ansible_facts', 'ansible_facts_modified']) + update_hosts(hosts_to_update) + update_hosts(hosts_to_update) diff --git a/awx/main/tests/functional/test_jobs.py b/awx/main/tests/functional/test_jobs.py index da3b9fd57c..972f10b411 100644 --- a/awx/main/tests/functional/test_jobs.py +++ b/awx/main/tests/functional/test_jobs.py @@ -6,6 +6,7 @@ import json from awx.main.models import ( Job, Instance, + Host, JobHostSummary, InventoryUpdate, InventorySource, @@ -18,6 +19,9 @@ from awx.main.models import ( ExecutionEnvironment, ) from awx.main.tasks.system import cluster_node_heartbeat +from awx.main.tasks.facts import update_hosts + +from django.db import OperationalError from django.test.utils import override_settings @@ -112,6 +116,51 @@ def test_job_notification_host_data(inventory, machine_credential, project, job_ } +@pytest.mark.django_db +class TestAnsibleFactsSave: + current_call = 0 + + def test_update_hosts_deleted_host(self, inventory): + hosts = [Host.objects.create(inventory=inventory, name=f'foo{i}') for i in range(3)] + for host in hosts: + host.ansible_facts = {'foo': 'bar'} + last_pk = hosts[-1].pk + assert inventory.hosts.count() == 3 + Host.objects.get(pk=last_pk).delete() + assert inventory.hosts.count() == 2 + update_hosts(hosts) + assert inventory.hosts.count() == 2 + for host in inventory.hosts.all(): + host.refresh_from_db() + assert host.ansible_facts == {'foo': 'bar'} + + def test_update_hosts_forever_deadlock(self, inventory, mocker): + hosts = [Host.objects.create(inventory=inventory, name=f'foo{i}') for i in range(3)] + for host in hosts: + host.ansible_facts = {'foo': 'bar'} + db_mock = mocker.patch('awx.main.tasks.facts.Host.objects.bulk_update') + db_mock.side_effect = OperationalError('deadlock detected') + with pytest.raises(OperationalError): + update_hosts(hosts) + + def fake_bulk_update(self, host_list): + if self.current_call > 2: + return Host.objects.bulk_update(host_list, ['ansible_facts', 'ansible_facts_modified']) + self.current_call += 1 + raise OperationalError('deadlock detected') + + def test_update_hosts_resolved_deadlock(self, inventory, mocker): + hosts = [Host.objects.create(inventory=inventory, name=f'foo{i}') for i in range(3)] + for host in hosts: + host.ansible_facts = {'foo': 'bar'} + self.current_call = 0 + mocker.patch('awx.main.tasks.facts.raw_update_hosts', new=self.fake_bulk_update) + update_hosts(hosts) + for host in inventory.hosts.all(): + host.refresh_from_db() + assert host.ansible_facts == {'foo': 'bar'} + + @pytest.mark.django_db class TestLaunchConfig: def test_null_creation_from_prompts(self):