AAP-65054 Fix bugs where concurrent jobs would clear facts of unrelated hosts (#16318)

* Add new tests for bug saving concurrent facts

* Fix first bug and improve tests

* Fix new bug where concurrent job clears facts from other job in unwanted way

* minor test fixes

* Add in missing playbook

* Fix host reference for constructed inventory

* Increase speed for concurrent fact tests

* Make test a bit faster

* Fix linters

* Add some functional tests

* Remove the sanity test

* Agent markers added

* Address SonarCloud

* Use backdating method, resolving stricter assertions

* Address coderabbit comments

* Address review comment with qs only method

* Delete missed sleep statement

* Add more coverage
This commit is contained in:
Alan Rominger
2026-03-05 07:58:32 -05:00
committed by GitHub
parent 0995f7c5fe
commit be5fbf365e
9 changed files with 691 additions and 42 deletions

View File

@@ -845,6 +845,21 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
def get_notification_friendly_name(self): def get_notification_friendly_name(self):
return "Job" return "Job"
def get_source_hosts_for_constructed_inventory(self):
    """Return a QuerySet of the source (input inventory) hosts for a constructed inventory.

    Constructed inventory hosts have an instance_id pointing to the real
    host in the input inventory. This resolves those references and returns
    a proper QuerySet (never a list), suitable for use with finish_fact_cache.
    """
    # NOTE(review): Host is resolved through the JobHostSummary relation rather
    # than imported directly — presumably to avoid an import cycle; confirm.
    Host = JobHostSummary._meta.get_field('host').related_model
    if not self.inventory_id:
        # No inventory attached: return an empty queryset (not None) so
        # callers can chain queryset methods unconditionally.
        return Host.objects.none()
    # instance_id is stored as text; Cast it to the Host pk field type so the
    # id__in subquery compares like with like. Hosts with an empty
    # instance_id are not references and are excluded.
    id_field = Host._meta.get_field('id')
    return Host.objects.filter(
        id__in=self.inventory.hosts.exclude(instance_id='').values_list(Cast('instance_id', output_field=id_field))
    ).only(*HOST_FACTS_FIELDS)
def get_hosts_for_fact_cache(self): def get_hosts_for_fact_cache(self):
""" """
Builds the queryset to use for writing or finalizing the fact cache Builds the queryset to use for writing or finalizing the fact cache
@@ -852,17 +867,15 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
For constructed inventories, that means the original (input inventory) hosts For constructed inventories, that means the original (input inventory) hosts
when slicing, that means only returning hosts in that slice when slicing, that means only returning hosts in that slice
""" """
Host = JobHostSummary._meta.get_field('host').related_model
if not self.inventory_id: if not self.inventory_id:
Host = JobHostSummary._meta.get_field('host').related_model
return Host.objects.none() return Host.objects.none()
if self.inventory.kind == 'constructed': if self.inventory.kind == 'constructed':
id_field = Host._meta.get_field('id') host_qs = self.get_source_hosts_for_constructed_inventory()
host_qs = Host.objects.filter(id__in=self.inventory.hosts.exclude(instance_id='').values_list(Cast('instance_id', output_field=id_field)))
else: else:
host_qs = self.inventory.hosts host_qs = self.inventory.hosts.only(*HOST_FACTS_FIELDS)
host_qs = host_qs.only(*HOST_FACTS_FIELDS)
host_qs = self.inventory.get_sliced_hosts(host_qs, self.job_slice_number, self.job_slice_count) host_qs = self.inventory.get_sliced_hosts(host_qs, self.job_slice_number, self.job_slice_count)
return host_qs return host_qs

View File

@@ -51,7 +51,14 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_
os.chmod(f.name, 0o600) os.chmod(f.name, 0o600)
json.dump(host.ansible_facts, f) json.dump(host.ansible_facts, f)
log_data['written_ct'] += 1 log_data['written_ct'] += 1
last_write_time = os.path.getmtime(filepath) # Backdate the file by 2 seconds so finish_fact_cache can reliably
# distinguish these reference files from files updated by ansible.
# This guarantees fact file mtime < summary file mtime even with
# zipfile's 2-second timestamp rounding during artifact transfer.
mtime = os.path.getmtime(filepath)
backdated = mtime - 2
os.utime(filepath, (backdated, backdated))
last_write_time = backdated
except IOError: except IOError:
logger.error(f'facts for host {smart_str(host.name)} could not be cached') logger.error(f'facts for host {smart_str(host.name)} could not be cached')
continue continue
@@ -74,7 +81,7 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_
msg='Inventory {inventory_id} host facts: updated {updated_ct}, cleared {cleared_ct}, unchanged {unmodified_ct}, took {delta:.3f} s', msg='Inventory {inventory_id} host facts: updated {updated_ct}, cleared {cleared_ct}, unchanged {unmodified_ct}, took {delta:.3f} s',
add_log_data=True, add_log_data=True,
) )
def finish_fact_cache(artifacts_dir, job_id=None, inventory_id=None, log_data=None): def finish_fact_cache(host_qs, artifacts_dir, job_id=None, inventory_id=None, job_created=None, log_data=None):
log_data = log_data or {} log_data = log_data or {}
log_data['inventory_id'] = inventory_id log_data['inventory_id'] = inventory_id
log_data['updated_ct'] = 0 log_data['updated_ct'] = 0
@@ -95,7 +102,7 @@ def finish_fact_cache(artifacts_dir, job_id=None, inventory_id=None, log_data=No
return return
host_names = summary.get('hosts_cached', []) host_names = summary.get('hosts_cached', [])
hosts_cached = Host.objects.filter(name__in=host_names).order_by('id').iterator() hosts_cached = host_qs.filter(name__in=host_names).order_by('id').iterator()
# Path where individual fact files were written # Path where individual fact files were written
fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache') fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache')
hosts_to_update = [] hosts_to_update = []
@@ -138,14 +145,25 @@ def finish_fact_cache(artifacts_dir, job_id=None, inventory_id=None, log_data=No
else: else:
# if the file goes missing, ansible removed it (likely via clear_facts) # if the file goes missing, ansible removed it (likely via clear_facts)
# if the file goes missing, but the host has not started facts, then we should not clear the facts # if the file goes missing, but the host has not started facts, then we should not clear the facts
host.ansible_facts = {} if job_created and host.ansible_facts_modified and host.ansible_facts_modified > job_created:
host.ansible_facts_modified = now() logger.warning(
hosts_to_update.append(host) f'Skipping fact clear for host {smart_str(host.name)} in job {job_id} '
logger.info(f'Facts cleared for inventory {smart_str(host.inventory.name)} host {smart_str(host.name)}') f'inventory {inventory_id}: host ansible_facts_modified '
log_data['cleared_ct'] += 1 f'({host.ansible_facts_modified.isoformat()}) is after this job\'s '
f'created time ({job_created.isoformat()}). '
f'A concurrent job likely updated this host\'s facts while this job was running.'
)
log_data['unmodified_ct'] += 1
else:
host.ansible_facts = {}
host.ansible_facts_modified = now()
hosts_to_update.append(host)
logger.info(f'Facts cleared for inventory {smart_str(host.inventory.name)} host {smart_str(host.name)}')
log_data['cleared_ct'] += 1
if len(hosts_to_update) >= 100: if len(hosts_to_update) >= 100:
bulk_update_sorted_by_id(Host, hosts_to_update, fields=['ansible_facts', 'ansible_facts_modified']) bulk_update_sorted_by_id(Host, hosts_to_update, fields=['ansible_facts', 'ansible_facts_modified'])
hosts_to_update = [] hosts_to_update = []
bulk_update_sorted_by_id(Host, hosts_to_update, fields=['ansible_facts', 'ansible_facts_modified']) bulk_update_sorted_by_id(Host, hosts_to_update, fields=['ansible_facts', 'ansible_facts_modified'])
logger.debug(f'Updated {log_data["updated_ct"]} host facts for inventory {inventory_id} in job {job_id}')

View File

@@ -1302,10 +1302,16 @@ class RunJob(SourceControlMixin, BaseTask):
return return
if self.should_use_fact_cache() and self.runner_callback.artifacts_processed: if self.should_use_fact_cache() and self.runner_callback.artifacts_processed:
job.log_lifecycle("finish_job_fact_cache") job.log_lifecycle("finish_job_fact_cache")
if job.inventory.kind == 'constructed':
hosts_qs = job.get_source_hosts_for_constructed_inventory()
else:
hosts_qs = job.inventory.hosts
finish_fact_cache( finish_fact_cache(
hosts_qs,
artifacts_dir=os.path.join(private_data_dir, 'artifacts', str(job.id)), artifacts_dir=os.path.join(private_data_dir, 'artifacts', str(job.id)),
job_id=job.id, job_id=job.id,
inventory_id=job.inventory_id, inventory_id=job.inventory_id,
job_created=job.created,
) )
def final_run_hook(self, job, status, private_data_dir): def final_run_hook(self, job, status, private_data_dir):

View File

@@ -0,0 +1,21 @@
---
# Generated by Claude Opus 4.6 (claude-opus-4-6).
# Sets a cacheable custom fact on every targeted host, then waits briefly so
# that two concurrent jobs running this playbook overlap in time.
- hosts: all
  vars:
    # Suffix appended to the 'foo' fact value; override via extra_vars
    # (e.g. "_v2") to distinguish one run's facts from another's.
    extra_value: ""
  gather_facts: false
  connection: local
  tasks:
    - name: set a custom fact
      set_fact:
        foo: "bar{{ extra_value }}"
        # Nested structure exercises fact serialization beyond flat scalars.
        bar:
          a:
            b:
              - "c"
              - "d"
        # cacheable: true pushes the fact into the fact cache plugin so it
        # survives the play and reaches the controller's fact cache.
        cacheable: true
    - name: sleep to create overlap window for concurrent job testing
      wait_for:
        timeout: 2

View File

@@ -1,6 +1,6 @@
import pytest import pytest
from awx.main.models import JobTemplate, Job, JobHostSummary, WorkflowJob, Inventory, Project, Organization from awx.main.models import JobTemplate, Job, JobHostSummary, WorkflowJob, Inventory, Host, Project, Organization
@pytest.mark.django_db @pytest.mark.django_db
@@ -87,3 +87,47 @@ class TestSlicingModels:
unified_job = job_template.create_unified_job(job_slice_count=2) unified_job = job_template.create_unified_job(job_slice_count=2)
assert isinstance(unified_job, Job) assert isinstance(unified_job, Job)
@pytest.mark.django_db
class TestGetSourceHostsForConstructedInventory:
    """Tests for Job.get_source_hosts_for_constructed_inventory"""

    def test_returns_source_hosts_via_instance_id(self):
        """Constructed hosts with instance_id pointing to source hosts are resolved correctly."""
        org = Organization.objects.create(name='test-org')
        inv_input = Inventory.objects.create(organization=org, name='input-inv')
        source_host1 = inv_input.hosts.create(name='host1')
        source_host2 = inv_input.hosts.create(name='host2')
        inv_constructed = Inventory.objects.create(organization=org, name='constructed-inv', kind='constructed')
        inv_constructed.input_inventories.add(inv_input)
        # Constructed hosts reference the real hosts by id, stored as text in instance_id.
        Host.objects.create(inventory=inv_constructed, name='host1', instance_id=str(source_host1.id))
        Host.objects.create(inventory=inv_constructed, name='host2', instance_id=str(source_host2.id))
        job = Job.objects.create(name='test-job', inventory=inv_constructed)
        result = job.get_source_hosts_for_constructed_inventory()
        # Compare as a set: ordering of the queryset is not part of the contract.
        assert set(result.values_list('id', flat=True)) == {source_host1.id, source_host2.id}

    def test_no_inventory_returns_empty(self):
        """A job with no inventory returns an empty queryset."""
        job = Job.objects.create(name='test-job')
        result = job.get_source_hosts_for_constructed_inventory()
        assert result.count() == 0

    def test_ignores_hosts_without_instance_id(self):
        """Hosts with empty instance_id are excluded from the result."""
        org = Organization.objects.create(name='test-org')
        inv_input = Inventory.objects.create(organization=org, name='input-inv')
        source_host = inv_input.hosts.create(name='host1')
        inv_constructed = Inventory.objects.create(organization=org, name='constructed-inv', kind='constructed')
        inv_constructed.input_inventories.add(inv_input)
        Host.objects.create(inventory=inv_constructed, name='host1', instance_id=str(source_host.id))
        # A constructed host with no instance_id has no source-host reference
        # and must not appear in the resolved queryset.
        Host.objects.create(inventory=inv_constructed, name='host-no-ref', instance_id='')
        job = Job.objects.create(name='test-job', inventory=inv_constructed)
        result = job.get_source_hosts_for_constructed_inventory()
        assert list(result.values_list('id', flat=True)) == [source_host.id]

View File

@@ -0,0 +1,188 @@
"""Functional tests for start_fact_cache / finish_fact_cache.
These tests use real database objects (via pytest-django) and real files
on disk, but do not launch jobs or subprocesses. Fact files are written
by start_fact_cache and then manipulated to simulate ansible output
before calling finish_fact_cache.
Generated by Claude Opus 4.6 (claude-opus-4-6).
"""
import json
import os
import time
from datetime import timedelta
import pytest
from django.utils.timezone import now
from awx.main.models import Host, Inventory
from awx.main.tasks.facts import start_fact_cache, finish_fact_cache
@pytest.fixture
def artifacts_dir(tmp_path):
    # Per-test artifacts directory on disk, returned as str (the fact cache
    # functions under test join paths with os.path, not pathlib).
    d = tmp_path / 'artifacts'
    d.mkdir()
    return str(d)
@pytest.mark.django_db
class TestFinishFactCacheScoping:
    """finish_fact_cache must only update hosts matched by the provided queryset."""

    def test_same_hostname_different_inventories(self, organization, artifacts_dir):
        """Two inventories share a hostname; only the targeted one should be updated.

        Generated by Claude Opus 4.6 (claude-opus-4-6).
        """
        inv1 = Inventory.objects.create(organization=organization, name='scope-inv1')
        inv2 = Inventory.objects.create(organization=organization, name='scope-inv2')
        host1 = inv1.hosts.create(name='shared')
        host2 = inv2.hosts.create(name='shared')
        # Give both hosts initial facts
        for h in (host1, host2):
            h.ansible_facts = {'original': True}
            h.ansible_facts_modified = now()
            h.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
        # start_fact_cache writes reference files for inv1's hosts
        start_fact_cache(inv1.hosts.all(), artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv1.id)
        # Simulate ansible writing new facts for 'shared'
        fact_file = os.path.join(artifacts_dir, 'fact_cache', 'shared')
        # A future mtime guarantees the file registers as updated relative to
        # the (backdated) reference file start_fact_cache wrote.
        future = time.time() + 60
        with open(fact_file, 'w') as f:
            json.dump({'updated': True}, f)
        os.utime(fact_file, (future, future))
        # finish with inv1's hosts as the queryset
        finish_fact_cache(inv1.hosts, artifacts_dir=artifacts_dir, inventory_id=inv1.id)
        host1.refresh_from_db()
        host2.refresh_from_db()
        assert host1.ansible_facts == {'updated': True}
        assert host2.ansible_facts == {'original': True}, 'Host in a different inventory was modified despite not being in the queryset'
@pytest.mark.django_db
class TestFinishFactCacheConcurrentProtection:
    """finish_fact_cache must not clear facts that a concurrent job updated."""

    def test_skip_clear_when_facts_modified_after_job_created(self, organization, artifacts_dir):
        """If a host's facts were updated after the job was created, do not clear them.

        Generated by Claude Opus 4.6 (claude-opus-4-6).
        """
        inv = Inventory.objects.create(organization=organization, name='concurrent-inv')
        host = inv.hosts.create(name='target')
        # Backdate the job's creation time so the concurrent update (below)
        # clearly lands after it.
        job_created = now() - timedelta(minutes=5)
        # start_fact_cache writes a reference file (host has no facts yet → no file written)
        start_fact_cache(inv.hosts.all(), artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv.id)
        # Simulate a concurrent job updating this host's facts AFTER our job was created
        host.ansible_facts = {'from_concurrent_job': True}
        host.ansible_facts_modified = now()
        host.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
        # The fact file is missing (our job's ansible didn't target this host),
        # which would normally trigger a clear. But the concurrent protection
        # should skip it because ansible_facts_modified > job_created.
        fact_file = os.path.join(artifacts_dir, 'fact_cache', host.name)
        if os.path.exists(fact_file):
            os.remove(fact_file)
        finish_fact_cache(
            inv.hosts,
            artifacts_dir=artifacts_dir,
            inventory_id=inv.id,
            job_created=job_created,
        )
        host.refresh_from_db()
        assert host.ansible_facts == {'from_concurrent_job': True}, 'Facts set by a concurrent job were cleared despite ansible_facts_modified > job_created'

    def test_clear_when_facts_predate_job(self, organization, artifacts_dir):
        """If facts predate the job, a missing file should still clear them.

        Generated by Claude Opus 4.6 (claude-opus-4-6).
        """
        inv = Inventory.objects.create(organization=organization, name='clear-inv')
        host = inv.hosts.create(name='stale')
        # Facts last modified an hour ago — well before the job's created time.
        old_time = now() - timedelta(hours=1)
        host.ansible_facts = {'stale': True}
        host.ansible_facts_modified = old_time
        host.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
        job_created = now() - timedelta(minutes=5)
        start_fact_cache(inv.hosts.all(), artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv.id)
        # Remove the fact file to simulate ansible's clear_facts
        os.remove(os.path.join(artifacts_dir, 'fact_cache', host.name))
        finish_fact_cache(
            inv.hosts,
            artifacts_dir=artifacts_dir,
            inventory_id=inv.id,
            job_created=job_created,
        )
        host.refresh_from_db()
        assert host.ansible_facts == {}, 'Stale facts should have been cleared when the fact file is missing ' 'and ansible_facts_modified predates job_created'
@pytest.mark.django_db
class TestConstructedInventoryFactCache:
    """finish_fact_cache with a constructed inventory queryset must target source hosts."""

    def test_facts_resolve_to_source_host(self, organization, artifacts_dir):
        """Facts must be written to the source host, not the constructed copy.

        Generated by Claude Opus 4.6 (claude-opus-4-6).
        """
        from django.db.models.functions import Cast

        inv_input = Inventory.objects.create(organization=organization, name='ci-input')
        source_host = inv_input.hosts.create(name='webserver')
        inv_constructed = Inventory.objects.create(organization=organization, name='ci-constructed', kind='constructed')
        inv_constructed.input_inventories.add(inv_input)
        # The constructed host is a reference to the source host via instance_id.
        constructed_host = Host.objects.create(
            inventory=inv_constructed,
            name='webserver',
            instance_id=str(source_host.id),
        )
        # Build the same queryset that get_hosts_for_fact_cache uses
        id_field = Host._meta.get_field('id')
        source_qs = Host.objects.filter(id__in=inv_constructed.hosts.exclude(instance_id='').values_list(Cast('instance_id', output_field=id_field)))
        # Give the source host initial facts so start_fact_cache writes a file
        source_host.ansible_facts = {'role': 'web'}
        source_host.ansible_facts_modified = now()
        source_host.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
        start_fact_cache(source_qs, artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv_constructed.id)
        # Simulate ansible writing updated facts
        fact_file = os.path.join(artifacts_dir, 'fact_cache', 'webserver')
        # Future mtime marks the file as changed relative to the reference file.
        future = time.time() + 60
        with open(fact_file, 'w') as f:
            json.dump({'role': 'web', 'deployed': True}, f)
        os.utime(fact_file, (future, future))
        finish_fact_cache(source_qs, artifacts_dir=artifacts_dir, inventory_id=inv_constructed.id)
        source_host.refresh_from_db()
        constructed_host.refresh_from_db()
        assert source_host.ansible_facts == {'role': 'web', 'deployed': True}
        assert not constructed_host.ansible_facts, f'Facts were stored on the constructed host: {constructed_host.ansible_facts!r}'

View File

@@ -0,0 +1,351 @@
"""Tests for concurrent fact caching with --limit.
Reproduces bugs where concurrent jobs targeting different hosts via --limit
incorrectly modify (clear or revert) facts for hosts outside their limit.
Customer report: concurrent jobs on the same job template with different limits
cause facts set by an earlier-finishing job to be rolled back when the
later-finishing job completes.
See: https://github.com/jritter/concurrent-aap-fact-caching
Generated by Claude Opus 4.6 (claude-opus-4-6).
"""
import logging
import pytest
from django.utils.timezone import now
from awx.api.versioning import reverse
from awx.main.models import Inventory, JobTemplate
from awx.main.tests.live.tests.conftest import wait_for_job, wait_to_leave_status
logger = logging.getLogger(__name__)
@pytest.fixture
def concurrent_facts_inventory(default_org):
    """Inventory with two hosts for concurrent fact cache testing."""
    inv_name = 'test_concurrent_fact_cache'
    # Delete leftovers from a prior run so the fixture is idempotent on a
    # live (persistent) test system.
    Inventory.objects.filter(organization=default_org, name=inv_name).delete()
    inv = Inventory.objects.create(organization=default_org, name=inv_name)
    inv.hosts.create(name='cc_host_0')
    inv.hosts.create(name='cc_host_1')
    return inv
@pytest.fixture
def concurrent_facts_jt(concurrent_facts_inventory, live_tmp_folder, post, admin, project_factory):
    """Job template configured for concurrent fact-cached runs."""
    proj = project_factory(scm_url=f'file://{live_tmp_folder}/facts')
    # A fresh project may still be syncing; wait so playbooks are listed.
    if proj.current_job:
        wait_for_job(proj.current_job)
    assert 'gather_slow.yml' in proj.playbooks, f'gather_slow.yml not in {proj.playbooks}'
    jt_name = 'test_concurrent_fact_cache JT'
    # Remove any job template left over from a previous run.
    existing_jt = JobTemplate.objects.filter(name=jt_name).first()
    if existing_jt:
        existing_jt.delete()
    # use_fact_cache + allow_simultaneous are the two settings this test
    # suite exercises together.
    result = post(
        reverse('api:job_template_list'),
        {
            'name': jt_name,
            'project': proj.id,
            'playbook': 'gather_slow.yml',
            'inventory': concurrent_facts_inventory.id,
            'use_fact_cache': True,
            'allow_simultaneous': True,
        },
        admin,
        expect=201,
    )
    return JobTemplate.objects.get(id=result.data['id'])
def test_concurrent_limit_does_not_clear_facts(concurrent_facts_inventory, concurrent_facts_jt):
    """Concurrent jobs with different --limit must not clear each other's facts.

    Generated by Claude Opus 4.6 (claude-opus-4-6).

    Scenario:
    - Inventory has cc_host_0 and cc_host_1, neither has prior facts
    - Job A runs gather_slow.yml with limit=cc_host_0
    - While Job A is still running (sleeping), Job B launches with limit=cc_host_1
    - Both jobs set cacheable facts, but only for their respective limited host
    - After both complete, BOTH hosts should have populated facts

    The bug: get_hosts_for_fact_cache() returns ALL inventory hosts regardless
    of --limit. start_fact_cache records them all in hosts_cached but writes
    no fact files (no prior facts). When the later-finishing job runs
    finish_fact_cache, it sees a missing fact file for the other job's host
    and clears that host's facts.
    """
    inv = concurrent_facts_inventory
    jt = concurrent_facts_jt
    # Launch Job A targeting cc_host_0
    job_a = jt.create_unified_job()
    job_a.limit = 'cc_host_0'
    job_a.save(update_fields=['limit'])
    job_a.signal_start()
    # Wait for Job A to reach running (it will sleep inside the playbook)
    wait_to_leave_status(job_a, 'pending')
    wait_to_leave_status(job_a, 'waiting')
    logger.info(f'Job A (id={job_a.id}) is now running with limit=cc_host_0')
    # Launch Job B targeting cc_host_1 while Job A is still running
    job_b = jt.create_unified_job()
    job_b.limit = 'cc_host_1'
    job_b.save(update_fields=['limit'])
    job_b.signal_start()
    # Verify that Job A is still running when Job B starts,
    # otherwise the overlap that triggers the bug did not happen.
    wait_to_leave_status(job_b, 'pending')
    wait_to_leave_status(job_b, 'waiting')
    job_a.refresh_from_db()
    if job_a.status != 'running':
        # Skip (not fail): lack of overlap means the scenario was not
        # reproduced, not that the code is wrong.
        pytest.skip('Job A finished before Job B started running; overlap did not occur')
    logger.info(f'Job B (id={job_b.id}) is now running with limit=cc_host_1 (concurrent with Job A)')
    # Wait for both to complete
    wait_for_job(job_a)
    wait_for_job(job_b)
    # Verify facts survived concurrent execution
    host_0 = inv.hosts.get(name='cc_host_0')
    host_1 = inv.hosts.get(name='cc_host_1')
    # sanity
    job_a.refresh_from_db()
    job_b.refresh_from_db()
    assert job_a.limit == "cc_host_0"
    assert job_b.limit == "cc_host_1"
    discovered_foos = [host_0.ansible_facts.get('foo'), host_1.ansible_facts.get('foo')]
    assert discovered_foos == ['bar'] * 2, f'Unexpected facts on cc_host_0 or _1: {discovered_foos} after job a,b {job_a.id}, {job_b.id}'
def test_concurrent_limit_does_not_revert_facts(live_tmp_folder, run_job_from_playbook, concurrent_facts_inventory):
    """Concurrent jobs must not revert facts that a prior concurrent job just set.

    Generated by Claude Opus 4.6 (claude-opus-4-6).

    Scenario:
    - First, populate both hosts with initial facts (foo=bar) via a
      non-concurrent gather run
    - Then run two concurrent jobs with different limits, each setting
      a new value (foo=bar_v2 via extra_vars)
    - After both complete, BOTH hosts should have foo=bar_v2

    The bug: start_fact_cache writes the OLD facts (foo=bar) into each job's
    artifact dir for ALL hosts. If ansible's cache plugin rewrites a non-limited
    host's fact file with the stale content (updating the mtime), finish_fact_cache
    treats it as a legitimate update and overwrites the DB with old values.
    """
    # --- Seed both hosts with initial facts via a non-concurrent run ---
    inv = concurrent_facts_inventory
    scm_url = f'file://{live_tmp_folder}/facts'
    res = run_job_from_playbook(
        'seed_facts_for_revert_test',
        'gather_slow.yml',
        scm_url=scm_url,
        jt_params={'use_fact_cache': True, 'allow_simultaneous': True, 'inventory': inv.id},
    )
    for host in inv.hosts.all():
        assert host.ansible_facts.get('foo') == 'bar', f'Seed run failed to set facts on {host.name}: {host.ansible_facts}'
    job = res['job']
    wait_for_job(job)
    # sanity, jobs should be set up to both have facts with just bar
    host_0 = inv.hosts.get(name='cc_host_0')
    host_1 = inv.hosts.get(name='cc_host_1')
    discovered_foos = [host_0.ansible_facts.get('foo'), host_1.ansible_facts.get('foo')]
    assert discovered_foos == ['bar'] * 2, f'Facts did not get expected initial values: {discovered_foos}'
    jt = job.job_template
    assert jt.allow_simultaneous is True
    assert jt.use_fact_cache is True
    # Sanity assertion, sometimes this would give problems from the Django rel cache
    assert jt.project
    # --- Run two concurrent jobs that write a new value ---
    # Update the JT to pass extra_vars that change the fact value
    jt.extra_vars = '{"extra_value": "_v2"}'
    jt.save(update_fields=['extra_vars'])
    job_a = jt.create_unified_job()
    job_a.limit = 'cc_host_0'
    job_a.save(update_fields=['limit'])
    job_a.signal_start()
    wait_to_leave_status(job_a, 'pending')
    wait_to_leave_status(job_a, 'waiting')
    job_b = jt.create_unified_job()
    job_b.limit = 'cc_host_1'
    job_b.save(update_fields=['limit'])
    job_b.signal_start()
    wait_to_leave_status(job_b, 'pending')
    wait_to_leave_status(job_b, 'waiting')
    job_a.refresh_from_db()
    if job_a.status != 'running':
        # No overlap means the race condition was not reproduced; skip, don't fail.
        pytest.skip('Job A finished before Job B started running; overlap did not occur')
    wait_for_job(job_a)
    wait_for_job(job_b)
    host_0 = inv.hosts.get(name='cc_host_0')
    host_1 = inv.hosts.get(name='cc_host_1')
    # Both hosts should have the UPDATED value, not the old seed value
    discovered_foos = [host_0.ansible_facts.get('foo'), host_1.ansible_facts.get('foo')]
    assert discovered_foos == ['bar_v2'] * 2, f'Facts were reverted to stale values by concurrent job cc_host_0 or cc_host_1: {discovered_foos}'
def test_fact_cache_scoped_to_inventory(live_tmp_folder, default_org, run_job_from_playbook):
    """finish_fact_cache must not modify hosts in other inventories.

    Generated by Claude Opus 4.6 (claude-opus-4-6).

    Bug: finish_fact_cache queries Host.objects.filter(name__in=host_names)
    without an inventory_id filter, so hosts with the same name in different
    inventories get their facts cross-contaminated.
    """
    shared_name = 'scope_shared_host'
    # Prepare for test by deleting junk from last run
    for inv_name in ('test_fact_scope_inv1', 'test_fact_scope_inv2'):
        inv = Inventory.objects.filter(name=inv_name).first()
        if inv:
            inv.delete()
    inv1 = Inventory.objects.create(organization=default_org, name='test_fact_scope_inv1')
    inv1.hosts.create(name=shared_name)
    inv2 = Inventory.objects.create(organization=default_org, name='test_fact_scope_inv2')
    host2 = inv2.hosts.create(name=shared_name)
    # Give inv2's host distinct facts that should not be touched
    original_facts = {'source': 'inventory_2', 'untouched': True}
    host2.ansible_facts = original_facts
    host2.ansible_facts_modified = now()
    host2.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
    # Run a fact-gathering job against inv1 only
    run_job_from_playbook(
        'test_fact_scope',
        'gather.yml',
        scm_url=f'file://{live_tmp_folder}/facts',
        jt_params={'use_fact_cache': True, 'inventory': inv1.id},
    )
    # inv1's host should have facts
    host1 = inv1.hosts.get(name=shared_name)
    assert host1.ansible_facts, f'inv1 host should have facts after gather: {host1.ansible_facts}'
    # inv2's host must NOT have been touched
    host2.refresh_from_db()
    assert host2.ansible_facts == original_facts, (
        f'Host in a different inventory was modified by a fact cache operation '
        f'on another inventory sharing the same hostname. '
        f'Expected {original_facts!r}, got {host2.ansible_facts!r}'
    )
def test_constructed_inventory_facts_saved_to_source_host(live_tmp_folder, default_org, run_job_from_playbook):
    """Facts from a constructed inventory job must be saved to the source host.

    Generated by Claude Opus 4.6 (claude-opus-4-6).

    Constructed inventories contain hosts that are references (via instance_id)
    to 'real' hosts in input inventories. start_fact_cache correctly resolves
    source hosts via get_hosts_for_fact_cache(), but finish_fact_cache must also
    write facts back to the source hosts, not the constructed inventory's copies.

    Scenario:
    - Two input inventories each have a host named 'ci_shared_host'
    - A constructed inventory uses both as inputs
    - The inventory sync picks one source host (via instance_id) for the
      constructed host — which one depends on input processing order
    - Both source hosts start with distinct pre-existing facts
    - A fact-gathering job runs against the constructed inventory
    - After completion, the targeted source host should have the job's facts
    - The OTHER source host must retain its original facts untouched
    - The constructed host itself must NOT have facts stored on it
      (constructed hosts are transient — recreated on each inventory sync)
    """
    shared_name = 'ci_shared_host'
    # Cleanup from prior runs
    for inv_name in ('test_ci_facts_input1', 'test_ci_facts_input2', 'test_ci_facts_constructed'):
        Inventory.objects.filter(name=inv_name).delete()
    # --- Create two input inventories, each with an identically-named host ---
    inv1 = Inventory.objects.create(organization=default_org, name='test_ci_facts_input1')
    source_host1 = inv1.hosts.create(name=shared_name)
    inv2 = Inventory.objects.create(organization=default_org, name='test_ci_facts_input2')
    source_host2 = inv2.hosts.create(name=shared_name)
    # Give both hosts distinct pre-existing facts so we can detect cross-contamination
    host1_original_facts = {'source': 'inventory_1'}
    source_host1.ansible_facts = host1_original_facts
    source_host1.ansible_facts_modified = now()
    source_host1.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
    host2_original_facts = {'source': 'inventory_2'}
    source_host2.ansible_facts = host2_original_facts
    source_host2.ansible_facts_modified = now()
    source_host2.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
    # Lookup tables keyed by host id, since we don't know in advance which
    # source host the sync will pick as the constructed host's target.
    source_hosts_by_id = {source_host1.id: source_host1, source_host2.id: source_host2}
    original_facts_by_id = {source_host1.id: host1_original_facts, source_host2.id: host2_original_facts}
    # --- Create constructed inventory (sync will create hosts from inputs) ---
    constructed_inv = Inventory.objects.create(
        organization=default_org,
        name='test_ci_facts_constructed',
        kind='constructed',
    )
    constructed_inv.input_inventories.add(inv1)
    constructed_inv.input_inventories.add(inv2)
    # --- Run a fact-gathering job against the constructed inventory ---
    # The job launch triggers an inventory sync which creates the constructed
    # host with instance_id pointing to one of the source hosts.
    scm_url = f'file://{live_tmp_folder}/facts'
    run_job_from_playbook(
        'test_ci_facts',
        'gather.yml',
        scm_url=scm_url,
        jt_params={'use_fact_cache': True, 'inventory': constructed_inv.id},
    )
    # --- Determine which source host the constructed host points to ---
    constructed_host = constructed_inv.hosts.get(name=shared_name)
    target_id = int(constructed_host.instance_id)
    other_id = (set(source_hosts_by_id.keys()) - {target_id}).pop()
    target_host = source_hosts_by_id[target_id]
    other_host = source_hosts_by_id[other_id]
    target_host.refresh_from_db()
    other_host.refresh_from_db()
    constructed_host.refresh_from_db()
    # Single combined assertion so a failure reports all three observations at once.
    actual = [target_host.ansible_facts.get('foo'), other_host.ansible_facts, constructed_host.ansible_facts]
    expected = ['bar', original_facts_by_id[other_id], {}]
    assert actual == expected, (
        f'Constructed inventory fact cache wrote to wrong host(s). '
        f'target source host (id={target_id}) foo={actual[0]!r}, '
        f'other source host (id={other_id}) facts={actual[1]!r}, '
        f'constructed host facts={actual[2]!r}; expected {expected!r}'
    )

View File

@@ -2,6 +2,7 @@
import json import json
import os import os
import pytest import pytest
from unittest import mock
from awx.main.models import ( from awx.main.models import (
Inventory, Inventory,
@@ -99,52 +100,55 @@ def test_start_job_fact_cache_within_timeout(hosts, tmpdir):
def test_finish_job_fact_cache_clear(hosts, mocker, ref_time, tmpdir): def test_finish_job_fact_cache_clear(hosts, mocker, ref_time, tmpdir):
fact_cache = os.path.join(tmpdir, 'facts') artifacts_dir = str(tmpdir.mkdir("artifacts"))
start_fact_cache(hosts, fact_cache, timeout=0) inventory_id = 5
bulk_update = mocker.patch('awx.main.tasks.facts.bulk_update_sorted_by_id') start_fact_cache(hosts, artifacts_dir=artifacts_dir, timeout=0, inventory_id=inventory_id)
# Mock the os.path.exists behavior for host deletion mocker.patch('awx.main.tasks.facts.bulk_update_sorted_by_id')
# Let's assume the fact file for hosts[1] is missing.
mocker.patch('os.path.exists', side_effect=lambda path: hosts[1].name not in path)
# Simulate one host's fact file getting deleted manually # Remove the fact file for hosts[1] to simulate ansible's clear_facts
host_to_delete_filepath = os.path.join(fact_cache, hosts[1].name) fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache')
os.remove(os.path.join(fact_cache_dir, hosts[1].name))
# Simulate the file being removed by checking existence first, to avoid FileNotFoundError hosts_qs = mock.MagicMock()
if os.path.exists(host_to_delete_filepath): hosts_qs.filter.return_value.order_by.return_value.iterator.return_value = iter(hosts)
os.remove(host_to_delete_filepath)
finish_fact_cache(fact_cache) finish_fact_cache(hosts_qs, artifacts_dir=artifacts_dir, inventory_id=inventory_id)
# Simulate side effects that would normally be applied during bulk update # hosts[1] should have had its facts cleared (file was missing, job_created=None)
hosts[1].ansible_facts = {} assert hosts[1].ansible_facts == {}
hosts[1].ansible_facts_modified = now() assert hosts[1].ansible_facts_modified > ref_time
# Verify facts are preserved for hosts with valid cache files # Other hosts should be unmodified (fact files exist but weren't changed by ansible)
for host in (hosts[0], hosts[2], hosts[3]): for host in (hosts[0], hosts[2], hosts[3]):
assert host.ansible_facts == {"a": 1, "b": 2} assert host.ansible_facts == {"a": 1, "b": 2}
assert host.ansible_facts_modified == ref_time assert host.ansible_facts_modified == ref_time
assert hosts[1].ansible_facts_modified > ref_time
# Current implementation skips the call entirely if hosts_to_update == []
bulk_update.assert_not_called()
def test_finish_job_fact_cache_with_bad_data(hosts, mocker, tmpdir): def test_finish_job_fact_cache_with_bad_data(hosts, mocker, tmpdir):
fact_cache = os.path.join(tmpdir, 'facts') artifacts_dir = str(tmpdir.mkdir("artifacts"))
start_fact_cache(hosts, fact_cache, timeout=0) inventory_id = 5
bulk_update = mocker.patch('django.db.models.query.QuerySet.bulk_update') start_fact_cache(hosts, artifacts_dir=artifacts_dir, timeout=0, inventory_id=inventory_id)
bulk_update = mocker.patch('awx.main.tasks.facts.bulk_update_sorted_by_id')
# Overwrite fact files with invalid JSON and set future mtime
fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache')
for h in hosts: for h in hosts:
filepath = os.path.join(fact_cache, h.name) filepath = os.path.join(fact_cache_dir, h.name)
with open(filepath, 'w') as f: with open(filepath, 'w') as f:
f.write('not valid json!') f.write('not valid json!')
f.flush() f.flush()
new_modification_time = time.time() + 3600 new_modification_time = time.time() + 3600
os.utime(filepath, (new_modification_time, new_modification_time)) os.utime(filepath, (new_modification_time, new_modification_time))
finish_fact_cache(fact_cache) hosts_qs = mock.MagicMock()
hosts_qs.filter.return_value.order_by.return_value.iterator.return_value = iter(hosts)
bulk_update.assert_not_called() finish_fact_cache(hosts_qs, artifacts_dir=artifacts_dir, inventory_id=inventory_id)
# Invalid JSON should be skipped — no hosts updated
updated_hosts = bulk_update.call_args[0][1]
assert updated_hosts == []

View File

@@ -84,7 +84,7 @@ def job_template_with_credentials():
@mock.patch('awx.main.tasks.jobs.create_partition', return_value=True) @mock.patch('awx.main.tasks.jobs.create_partition', return_value=True)
def test_pre_post_run_hook_facts(mock_create_partition, mock_facts_settings, private_data_dir, execution_environment): def test_pre_post_run_hook_facts(mock_create_partition, mock_facts_settings, private_data_dir, execution_environment):
# Create mocked inventory and host queryset # Create mocked inventory and host queryset
inventory = mock.MagicMock(spec=Inventory, pk=1) inventory = mock.MagicMock(spec=Inventory, pk=1, kind='')
host1 = mock.MagicMock(spec=Host, id=1, name='host1', ansible_facts={"a": 1, "b": 2}, ansible_facts_modified=now(), inventory=inventory) host1 = mock.MagicMock(spec=Host, id=1, name='host1', ansible_facts={"a": 1, "b": 2}, ansible_facts_modified=now(), inventory=inventory)
host2 = mock.MagicMock(spec=Host, id=2, name='host2', ansible_facts={"a": 1, "b": 2}, ansible_facts_modified=now(), inventory=inventory) host2 = mock.MagicMock(spec=Host, id=2, name='host2', ansible_facts={"a": 1, "b": 2}, ansible_facts_modified=now(), inventory=inventory)
@@ -107,6 +107,8 @@ def test_pre_post_run_hook_facts(mock_create_partition, mock_facts_settings, pri
job_slice_number=1, job_slice_number=1,
job_slice_count=1, job_slice_count=1,
inventory=inventory, inventory=inventory,
inventory_id=inventory.pk,
created=now(),
execution_environment=execution_environment, execution_environment=execution_environment,
) )
job.get_hosts_for_fact_cache = Job.get_hosts_for_fact_cache.__get__(job) job.get_hosts_for_fact_cache = Job.get_hosts_for_fact_cache.__get__(job)
@@ -140,7 +142,7 @@ def test_pre_post_run_hook_facts(mock_create_partition, mock_facts_settings, pri
@mock.patch('awx.main.tasks.jobs.create_partition', return_value=True) @mock.patch('awx.main.tasks.jobs.create_partition', return_value=True)
def test_pre_post_run_hook_facts_deleted_sliced(mock_create_partition, mock_facts_settings, private_data_dir, execution_environment): def test_pre_post_run_hook_facts_deleted_sliced(mock_create_partition, mock_facts_settings, private_data_dir, execution_environment):
# Fully mocked inventory # Fully mocked inventory
mock_inventory = mock.MagicMock(spec=Inventory) mock_inventory = mock.MagicMock(spec=Inventory, pk=1, kind='')
# Create 999 mocked Host instances # Create 999 mocked Host instances
hosts = [] hosts = []
@@ -173,6 +175,8 @@ def test_pre_post_run_hook_facts_deleted_sliced(mock_create_partition, mock_fact
job.job_slice_count = 3 job.job_slice_count = 3
job.execution_environment = execution_environment job.execution_environment = execution_environment
job.inventory = mock_inventory job.inventory = mock_inventory
job.inventory_id = mock_inventory.pk
job.created = now()
job.job_env.get.return_value = private_data_dir job.job_env.get.return_value = private_data_dir
# Bind actual method for host filtering # Bind actual method for host filtering