diff --git a/awx/main/tasks/facts.py b/awx/main/tasks/facts.py index 8fbdae9019..48772e9c2c 100644 --- a/awx/main/tasks/facts.py +++ b/awx/main/tasks/facts.py @@ -25,7 +25,8 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_ log_data = log_data or {} log_data['inventory_id'] = inventory_id log_data['written_ct'] = 0 - hosts_cached = [] + # Dict mapping host name -> bool (True if a fact file was written) + hosts_cached = {} # Create the fact_cache directory inside artifacts_dir fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache') @@ -37,13 +38,14 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_ last_write_time = None for host in hosts: - hosts_cached.append(host.name) if not host.ansible_facts_modified or (timeout and host.ansible_facts_modified < now() - datetime.timedelta(seconds=timeout)): + hosts_cached[host.name] = False continue # facts are expired - do not write them filepath = os.path.join(fact_cache_dir, host.name) if not os.path.realpath(filepath).startswith(fact_cache_dir): logger.error(f'facts for host {smart_str(host.name)} could not be cached') + hosts_cached[host.name] = False continue try: @@ -59,8 +61,10 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_ backdated = mtime - 2 os.utime(filepath, (backdated, backdated)) last_write_time = backdated + hosts_cached[host.name] = True except IOError: logger.error(f'facts for host {smart_str(host.name)} could not be cached') + hosts_cached[host.name] = False continue # Write summary file directly to the artifacts_dir @@ -69,7 +73,6 @@ def start_fact_cache(hosts, artifacts_dir, timeout=None, inventory_id=None, log_ summary_data = { 'last_write_time': last_write_time, 'hosts_cached': hosts_cached, - 'written_ct': log_data['written_ct'], } with open(summary_file, 'w', encoding='utf-8') as f: json.dump(summary_data, f, indent=2) @@ -101,7 +104,8 @@ def finish_fact_cache(host_qs, artifacts_dir, job_id=None, inventory_id=None, jo logger.error(f'Error reading summary file at {summary_path}: {e}') return - host_names = summary.get('hosts_cached', []) + hosts_cached_map = summary.get('hosts_cached', {}) + host_names = list(hosts_cached_map.keys()) hosts_cached = host_qs.filter(name__in=host_names).order_by('id').iterator() # Path where individual fact files were written fact_cache_dir = os.path.join(artifacts_dir, 'fact_cache') @@ -143,6 +147,14 @@ def finish_fact_cache(host_qs, artifacts_dir, job_id=None, inventory_id=None, jo else: log_data['unmodified_ct'] += 1 else: + # File is missing. Only interpret this as "ansible cleared facts" if + # start_fact_cache actually wrote a file for this host (i.e. the host + # had valid, non-expired facts before the job ran). If no file was + # ever written, the missing file is expected and not a clear signal. + if not hosts_cached_map.get(host.name): + log_data['unmodified_ct'] += 1 + continue + # if the file goes missing, ansible removed it (likely via clear_facts) # if the file goes missing, but the host has not started facts, then we should not clear the facts if job_created and host.ansible_facts_modified and host.ansible_facts_modified > job_created: diff --git a/awx/main/tests/functional/tasks/test_fact_cache.py b/awx/main/tests/functional/tasks/test_fact_cache.py index 1ca43da034..07e6a325ec 100644 --- a/awx/main/tests/functional/tasks/test_fact_cache.py +++ b/awx/main/tests/functional/tasks/test_fact_cache.py @@ -73,17 +73,21 @@ class TestFinishFactCacheScoping: class TestFinishFactCacheConcurrentProtection: """finish_fact_cache must not clear facts that a concurrent job updated.""" - def test_skip_clear_when_facts_modified_after_job_created(self, organization, artifacts_dir): - """If a host's facts were updated after the job was created, do not clear them. + def test_no_clear_when_no_file_was_written(self, organization, artifacts_dir): + """Host with no prior facts should not have facts cleared when file is missing. Generated by Claude Opus 4.6 (claude-opus-4-6). + + start_fact_cache records hosts_cached[host] = False for hosts with no + prior facts (no file written). finish_fact_cache should skip the clear + for these hosts because the missing file is expected, not a clear signal. """ inv = Inventory.objects.create(organization=organization, name='concurrent-inv') host = inv.hosts.create(name='target') job_created = now() - timedelta(minutes=5) - # start_fact_cache writes a reference file (host has no facts yet → no file written) + # start_fact_cache records host with False (no facts → no file written) start_fact_cache(inv.hosts.all(), artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv.id) # Simulate a concurrent job updating this host's facts AFTER our job was created @@ -91,12 +95,43 @@ class TestFinishFactCacheConcurrentProtection: host.ansible_facts_modified = now() host.save(update_fields=['ansible_facts', 'ansible_facts_modified']) - # The fact file is missing (our job's ansible didn't target this host), - # which would normally trigger a clear. But the concurrent protection - # should skip it because ansible_facts_modified > job_created. - fact_file = os.path.join(artifacts_dir, 'fact_cache', host.name) - if os.path.exists(fact_file): - os.remove(fact_file) + # The fact file is missing because start_fact_cache never wrote one. + # finish_fact_cache should skip this host entirely. + finish_fact_cache( + inv.hosts, + artifacts_dir=artifacts_dir, + inventory_id=inv.id, + job_created=job_created, + ) + + host.refresh_from_db() + assert host.ansible_facts == {'from_concurrent_job': True}, 'Facts were cleared for a host that never had a fact file written' + + def test_skip_clear_when_facts_modified_after_job_created(self, organization, artifacts_dir): + """If a file was written and then deleted, but facts were concurrently updated, skip clear. + + Generated by Claude Opus 4.6 (claude-opus-4-6). + """ + inv = Inventory.objects.create(organization=organization, name='concurrent-written-inv') + host = inv.hosts.create(name='target') + + old_time = now() - timedelta(hours=1) + host.ansible_facts = {'original': True} + host.ansible_facts_modified = old_time + host.save(update_fields=['ansible_facts', 'ansible_facts_modified']) + + job_created = now() - timedelta(minutes=5) + + # start_fact_cache writes a file (host has facts → True in map) + start_fact_cache(inv.hosts.all(), artifacts_dir=artifacts_dir, timeout=0, inventory_id=inv.id) + + # Remove the fact file (ansible didn't target this host via --limit) + os.remove(os.path.join(artifacts_dir, 'fact_cache', host.name)) + + # Simulate a concurrent job updating this host's facts AFTER our job was created + host.ansible_facts = {'from_concurrent_job': True} + host.ansible_facts_modified = now() + host.save(update_fields=['ansible_facts', 'ansible_facts_modified']) finish_fact_cache( inv.hosts,