Record whether a file was written for fact cache (#16361)

This commit is contained in:
Alan Rominger
2026-03-20 12:53:34 -04:00
committed by GitHub
parent ff68d6196d
commit 377dfce197
2 changed files with 60 additions and 13 deletions

View File

@@ -25,7 +25,8 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_
log_data = log_data or {} log_data = log_data or {}
log_data['inventory_id'] = inventory_id log_data['inventory_id'] = inventory_id
log_data['written_ct'] = 0 log_data['written_ct'] = 0
hosts_cached = [] # Dict mapping host name -> bool (True if a fact file was written)
hosts_cached = {}
# Create the fact_cache directory inside artifacts_dir # Create the fact_cache directory inside artifacts_dir
fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache') fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache')
@@ -37,13 +38,14 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_
last_write_time = None last_write_time = None
for host in hosts: for host in hosts:
hosts_cached.append(host.name)
if not host.ansible_facts_modified or (timeout and host.ansible_facts_modified < now() - datetime.timedelta(seconds=timeout)): if not host.ansible_facts_modified or (timeout and host.ansible_facts_modified < now() - datetime.timedelta(seconds=timeout)):
hosts_cached[host.name] = False
continue # facts are expired - do not write them continue # facts are expired - do not write them
filepath = os.path.join(fact_cache_dir, host.name) filepath = os.path.join(fact_cache_dir, host.name)
if not os.path.realpath(filepath).startswith(fact_cache_dir): if not os.path.realpath(filepath).startswith(fact_cache_dir):
logger.error(f'facts for host {smart_str(host.name)} could not be cached') logger.error(f'facts for host {smart_str(host.name)} could not be cached')
hosts_cached[host.name] = False
continue continue
try: try:
@@ -59,8 +61,10 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_
backdated = mtime - 2 backdated = mtime - 2
os.utime(filepath, (backdated, backdated)) os.utime(filepath, (backdated, backdated))
last_write_time = backdated last_write_time = backdated
hosts_cached[host.name] = True
except IOError: except IOError:
logger.error(f'facts for host {smart_str(host.name)} could not be cached') logger.error(f'facts for host {smart_str(host.name)} could not be cached')
hosts_cached[host.name] = False
continue continue
# Write summary file directly to the artifacts_dir # Write summary file directly to the artifacts_dir
@@ -69,7 +73,6 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_
summary_data = { summary_data = {
'last_write_time': last_write_time, 'last_write_time': last_write_time,
'hosts_cached': hosts_cached, 'hosts_cached': hosts_cached,
'written_ct': log_data['written_ct'],
} }
with open(summary_file, 'w', encoding='utf-8') as f: with open(summary_file, 'w', encoding='utf-8') as f:
json.dump(summary_data, f, indent=2) json.dump(summary_data, f, indent=2)
@@ -101,7 +104,8 @@ def finish_fact_cache(host_qs, artifacts_dir, job_id=None, inventory_id=None, jo
logger.error(f'Error reading summary file at {summary_path}: {e}') logger.error(f'Error reading summary file at {summary_path}: {e}')
return return
host_names = summary.get('hosts_cached', []) hosts_cached_map = summary.get('hosts_cached', {})
host_names = list(hosts_cached_map.keys())
hosts_cached = host_qs.filter(name__in=host_names).order_by('id').iterator() hosts_cached = host_qs.filter(name__in=host_names).order_by('id').iterator()
# Path where individual fact files were written # Path where individual fact files were written
fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache') fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache')
@@ -143,6 +147,14 @@ def finish_fact_cache(host_qs, artifacts_dir, job_id=None, inventory_id=None, jo
else: else:
log_data['unmodified_ct'] += 1 log_data['unmodified_ct'] += 1
else: else:
# File is missing. Only interpret this as "ansible cleared facts" if
# start_fact_cache actually wrote a file for this host (i.e. the host
# had valid, non-expired facts before the job ran). If no file was
# ever written, the missing file is expected and not a clear signal.
if not hosts_cached_map.get(host.name):
log_data['unmodified_ct'] += 1
continue
# if the file goes missing, ansible removed it (likely via clear_facts) # if the file goes missing, ansible removed it (likely via clear_facts)
# if the file goes missing, but the host has not started facts, then we should not clear the facts # if the file goes missing, but the host has not started facts, then we should not clear the facts
if job_created and host.ansible_facts_modified and host.ansible_facts_modified > job_created: if job_created and host.ansible_facts_modified and host.ansible_facts_modified > job_created:

View File

@@ -73,17 +73,21 @@ class TestFinishFactCacheScoping:
class TestFinishFactCacheConcurrentProtection: class TestFinishFactCacheConcurrentProtection:
"""finish_fact_cache must not clear facts that a concurrent job updated.""" """finish_fact_cache must not clear facts that a concurrent job updated."""
def test_skip_clear_when_facts_modified_after_job_created(self, organization, artifacts_dir): def test_no_clear_when_no_file_was_written(self, organization, artifacts_dir):
"""If a host's facts were updated after the job was created, do not clear them. """Host with no prior facts should not have facts cleared when file is missing.
Generated by Claude Opus 4.6 (claude-opus-4-6). Generated by Claude Opus 4.6 (claude-opus-4-6).
start_fact_cache records hosts_cached[host] = False for hosts with no
prior facts (no file written). finish_fact_cache should skip the clear
for these hosts because the missing file is expected, not a clear signal.
""" """
inv = Inventory.objects.create(organization=organization, name='concurrent-inv') inv = Inventory.objects.create(organization=organization, name='concurrent-inv')
host = inv.hosts.create(name='target') host = inv.hosts.create(name='target')
job_created = now() - timedelta(minutes=5) job_created = now() - timedelta(minutes=5)
# start_fact_cache writes a reference file (host has no facts yet → no file written) # start_fact_cache records host with False (no facts → no file written)
start_fact_cache(inv.hosts.all(), artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv.id) start_fact_cache(inv.hosts.all(), artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv.id)
# Simulate a concurrent job updating this host's facts AFTER our job was created # Simulate a concurrent job updating this host's facts AFTER our job was created
@@ -91,12 +95,43 @@ class TestFinishFactCacheConcurrentProtection:
host.ansible_facts_modified = now() host.ansible_facts_modified = now()
host.save(update_fields=['ansible_facts', 'ansible_facts_modified']) host.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
# The fact file is missing (our job's ansible didn't target this host), # The fact file is missing because start_fact_cache never wrote one.
# which would normally trigger a clear. But the concurrent protection # finish_fact_cache should skip this host entirely.
# should skip it because ansible_facts_modified > job_created. finish_fact_cache(
fact_file = os.path.join(artifacts_dir, 'fact_cache', host.name) inv.hosts,
if os.path.exists(fact_file): artifacts_dir=artifacts_dir,
os.remove(fact_file) inventory_id=inv.id,
job_created=job_created,
)
host.refresh_from_db()
assert host.ansible_facts == {'from_concurrent_job': True}, 'Facts were cleared for a host that never had a fact file written'
def test_skip_clear_when_facts_modified_after_job_created(self, organization, artifacts_dir):
"""If a file was written and then deleted, but facts were concurrently updated, skip clear.
Generated by Claude Opus 4.6 (claude-opus-4-6).
"""
inv = Inventory.objects.create(organization=organization, name='concurrent-written-inv')
host = inv.hosts.create(name='target')
old_time = now() - timedelta(hours=1)
host.ansible_facts = {'original': True}
host.ansible_facts_modified = old_time
host.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
job_created = now() - timedelta(minutes=5)
# start_fact_cache writes a file (host has facts → True in map)
start_fact_cache(inv.hosts.all(), artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv.id)
# Remove the fact file (ansible didn't target this host via --limit)
os.remove(os.path.join(artifacts_dir, 'fact_cache', host.name))
# Simulate a concurrent job updating this host's facts AFTER our job was created
host.ansible_facts = {'from_concurrent_job': True}
host.ansible_facts_modified = now()
host.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
finish_fact_cache( finish_fact_cache(
inv.hosts, inv.hosts,