Add ee cleanup tests

* Adds cleanup tests to the live test.
This commit is contained in:
Chris Meyers 2025-01-21 14:38:36 -05:00 committed by Chris Meyers
parent 534c312328
commit ad706d67c2
5 changed files with 127 additions and 43 deletions

View File

@ -228,22 +228,24 @@ class RemoteJobError(RuntimeError):
pass
def run_until_complete(node, timing_data=None, **kwargs):
def run_until_complete(node, timing_data=None, worktype='ansible-runner', ttl='20s', **kwargs):
"""
Runs an ansible-runner work_type on remote node, waits until it completes, then returns stdout.
"""
config_data = read_receptor_config()
receptor_ctl = get_receptor_ctl(config_data)
use_stream_tls = getattr(get_conn_type(node, receptor_ctl), 'name', None) == "STREAMTLS"
kwargs.setdefault('tlsclient', get_tls_client(config_data, use_stream_tls))
kwargs.setdefault('ttl', '20s')
if ttl is not None:
kwargs['ttl'] = ttl
kwargs.setdefault('payload', '')
if work_signing_enabled(config_data):
kwargs['signwork'] = True
transmit_start = time.time()
result = receptor_ctl.submit_work(worktype='ansible-runner', node=node, **kwargs)
result = receptor_ctl.submit_work(worktype=worktype, node=node, **kwargs)
unit_id = result['unitid']
run_start = time.time()
@ -371,7 +373,7 @@ def _convert_args_to_cli(vargs):
return args
def worker_cleanup(node_name, vargs, timeout=300.0):
def worker_cleanup(node_name, vargs):
args = _convert_args_to_cli(vargs)
remote_command = ' '.join(args)

View File

@ -25,6 +25,7 @@ from django.utils.translation import gettext_lazy as _
from django.utils.translation import gettext_noop
from django.core.cache import cache
from django.core.exceptions import ObjectDoesNotExist
from django.db.models.query import QuerySet
# Django-CRUM
from crum import impersonate
@ -379,48 +380,68 @@ def purge_old_stdout_files():
logger.debug("Removing {}".format(os.path.join(settings.JOBOUTPUT_ROOT, f)))
def _cleanup_images_and_files(**kwargs):
if settings.IS_K8S:
return
this_inst = Instance.objects.me()
runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs)
if runner_cleanup_kwargs:
stdout = ''
with StringIO() as buffer:
with redirect_stdout(buffer):
ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs)
stdout = buffer.getvalue()
if '(changed: True)' in stdout:
logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}')
class CleanupImagesAndFiles:
@classmethod
def get_first_control_instance(cls) -> Instance | None:
return (
Instance.objects.filter(node_type__in=['hybrid', 'control'], node_state=Instance.States.READY, enabled=True, capacity__gt=0)
.order_by('-hostname')
.first()
)
# if we are the first instance alphabetically, then run cleanup on execution nodes
checker_instance = (
Instance.objects.filter(node_type__in=['hybrid', 'control'], node_state=Instance.States.READY, enabled=True, capacity__gt=0)
.order_by('-hostname')
.first()
)
if checker_instance and this_inst.hostname == checker_instance.hostname:
for inst in Instance.objects.filter(node_type='execution', node_state=Instance.States.READY, enabled=True, capacity__gt=0):
runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs)
if not runner_cleanup_kwargs:
continue
try:
stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs)
if '(changed: True)' in stdout:
logger.info(f'Performed cleanup on execution node {inst.hostname} with output:\n{stdout}')
except RuntimeError:
logger.exception(f'Error running cleanup on execution node {inst.hostname}')
@classmethod
def get_execution_instances(cls) -> QuerySet[Instance]:
return Instance.objects.filter(node_type='execution', node_state=Instance.States.READY, enabled=True, capacity__gt=0)
@classmethod
def run_local(cls, this_inst: Instance, **kwargs):
if settings.IS_K8S:
return
runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs)
if runner_cleanup_kwargs:
stdout = ''
with StringIO() as buffer:
with redirect_stdout(buffer):
ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs)
stdout = buffer.getvalue()
if '(changed: True)' in stdout:
logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}')
@classmethod
def run_remote(cls, this_inst: Instance, **kwargs):
# if we are the first instance alphabetically, then run cleanup on execution nodes
checker_instance = cls.get_first_control_instance()
if checker_instance and this_inst.hostname == checker_instance.hostname:
for inst in cls.get_execution_instances():
runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs)
if not runner_cleanup_kwargs:
continue
try:
stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs)
if '(changed: True)' in stdout:
logger.info(f'Performed cleanup on execution node {inst.hostname} with output:\n{stdout}')
except RuntimeError:
logger.exception(f'Error running cleanup on execution node {inst.hostname}')
@classmethod
def run(cls, **kwargs):
if settings.IS_K8S:
return
this_inst = Instance.objects.me()
cls.run_local(this_inst, **kwargs)
cls.run_remote(this_inst, **kwargs)
@task(queue='tower_broadcast_all')
def handle_removed_image(remove_images=None):
"""Special broadcast invocation of this method to handle case of deleted EE"""
_cleanup_images_and_files(remove_images=remove_images, file_pattern='')
CleanupImagesAndFiles.run(remove_images=remove_images, file_pattern='')
@task(queue=get_task_queuename)
def cleanup_images_and_files():
_cleanup_images_and_files(image_prune=True)
CleanupImagesAndFiles.run(image_prune=True)
@task(queue=get_task_queuename)

View File

@ -4,7 +4,7 @@ import tempfile
import shutil
from awx.main.tasks.jobs import RunJob
from awx.main.tasks.system import execution_node_health_check, _cleanup_images_and_files
from awx.main.tasks.system import CleanupImagesAndFiles, execution_node_health_check
from awx.main.models import Instance, Job
@ -48,22 +48,22 @@ def mock_job_folder(job_folder_factory):
@pytest.mark.django_db
def test_folder_cleanup_stale_file(mock_job_folder, mock_me):
_cleanup_images_and_files()
CleanupImagesAndFiles.run()
assert os.path.exists(mock_job_folder) # grace period should protect folder from deletion
_cleanup_images_and_files(grace_period=0)
CleanupImagesAndFiles.run(grace_period=0)
assert not os.path.exists(mock_job_folder) # should be deleted
@pytest.mark.django_db
def test_folder_cleanup_running_job(mock_job_folder, me_inst):
job = Job.objects.create(id=1234, controller_node=me_inst.hostname, status='running')
_cleanup_images_and_files(grace_period=0)
CleanupImagesAndFiles.run(grace_period=0)
assert os.path.exists(mock_job_folder) # running job should prevent folder from getting deleted
job.status = 'failed'
job.save(update_fields=['status'])
_cleanup_images_and_files(grace_period=0)
CleanupImagesAndFiles.run(grace_period=0)
assert not os.path.exists(mock_job_folder) # job is finished and no grace period, should delete
@ -78,7 +78,7 @@ def test_folder_cleanup_multiple_running_jobs(job_folder_factory, me_inst):
dirs.append(job_folder_factory(job.id))
jobs.append(job)
_cleanup_images_and_files(grace_period=0)
CleanupImagesAndFiles.run(grace_period=0)
assert [os.path.exists(d) for d in dirs] == [True for i in range(num_jobs)]

View File

@ -1,3 +1,4 @@
import subprocess
import time
import pytest
@ -59,3 +60,20 @@ def default_org():
def demo_inv(default_org):
inventory, _ = Inventory.objects.get_or_create(name='Demo Inventory', defaults={'organization': default_org})
return inventory
@pytest.fixture
def podman_image_generator():
"""
Generate a tagless podman image from awx base EE
"""
def fn():
dockerfile = """
FROM quay.io/ansible/awx-ee:latest
RUN echo "Hello, Podman!" > /tmp/hello.txt
"""
cmd = ['podman', 'build', '-f', '-'] # Create an image without a tag
subprocess.run(cmd, capture_output=True, input=dockerfile, text=True, check=True)
return fn

View File

@ -1,11 +1,21 @@
import os
import json
import pytest
import tempfile
import subprocess
from awx.main.tasks.receptor import _convert_args_to_cli
from unittest import mock
from awx.main.tasks.receptor import _convert_args_to_cli, run_until_complete
from awx.main.tasks.system import CleanupImagesAndFiles
from awx.main.models import Instance, JobTemplate
def get_podman_images():
cmd = ['podman', 'images', '--format', 'json']
return json.loads((subprocess.run(cmd, capture_output=True, text=True, check=True)).stdout)
def test_folder_cleanup_multiple_running_jobs_execution_node(request):
demo_jt = JobTemplate.objects.get(name='Demo Job Template')
@ -37,3 +47,36 @@ def test_folder_cleanup_multiple_running_jobs_execution_node(request):
print('ansible-runner worker ' + remote_command)
assert [os.path.exists(job_dir) for job_dir in job_dirs] == [True for i in range(3)]
@pytest.mark.parametrize(
'worktype',
('remote', 'local'),
)
def test_tagless_image(podman_image_generator, worktype: str):
"""
Ensure podman images on Control and Hybrid nodes are deleted during cleanup.
"""
podman_image_generator()
dangling_image = next((image for image in get_podman_images() if image.get('Dangling', False)), None)
assert dangling_image
instance_me = Instance.objects.me()
match worktype:
case 'local':
CleanupImagesAndFiles.run_local(instance_me, image_prune=True)
case 'remote':
with (
mock.patch(
'awx.main.tasks.receptor.run_until_complete', lambda *args, **kwargs: run_until_complete(*args, worktype='local', ttl=None, **kwargs)
),
mock.patch('awx.main.tasks.system.CleanupImagesAndFiles.get_execution_instances', lambda: [Instance.objects.me()]),
):
CleanupImagesAndFiles.run_remote(instance_me, image_prune=True)
case _:
raise ValueError(f'worktype "{worktype}" not supported.')
for image in get_podman_images():
assert image['Id'] != dangling_image['Id']