From 166090091439cd407a0ed881da748f2f2fcfadaf Mon Sep 17 00:00:00 2001
From: Shane McDonald
Date: Wed, 13 Oct 2021 12:20:26 -0400
Subject: [PATCH 01/45] Don't fail CI when pre-built images aren't available

CI will build the image from scratch if the pre-built image is not available.
---
 .github/workflows/ci.yml | 12 ++++++------
 .github/workflows/upload_schema.yml | 2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 70e8b37b99..0953a16727 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -19,7 +19,7 @@ jobs:
 
       - name: Pre-pull image to warm build cache
         run: |
-          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }}
+          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }} || :
 
       - name: Build image
         run: |
@@ -43,7 +43,7 @@ jobs:
 
       - name: Pre-pull image to warm build cache
         run: |
-          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }}
+          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }} || :
 
       - name: Build image
         run: |
@@ -91,7 +91,7 @@ jobs:
 
       - name: Pre-pull image to warm build cache
         run: |
-          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }}
+          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }} || :
 
       - name: Build image
         run: |
@@ -115,7 +115,7 @@ jobs:
 
       - name: Pre-pull image to warm build cache
         run: |
-          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }}
+          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }} || :
 
       - name: Build image
         run: |
@@ -139,7 +139,7 @@ jobs:
 
       - name: Pre-pull image to warm build cache
         run: |
-          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }}
+          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }} || :
 
       - name: Build image
         run: |
@@ -163,7 +163,7 @@ jobs:
 
       - name: Pre-pull image to warm build cache
         run: |
-          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }}
+          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${{ env.BRANCH }} || :
 
       - name: Build image
         run: |

diff --git a/.github/workflows/upload_schema.yml b/.github/workflows/upload_schema.yml
index 4d90f96a66..3b73e8c956 100644
--- a/.github/workflows/upload_schema.yml
+++ b/.github/workflows/upload_schema.yml
@@ -19,7 +19,7 @@ jobs:
 
       - name: Pre-pull image to warm build cache
         run: |
-          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${GITHUB_REF##*/}
+          docker pull ghcr.io/${{ github.repository_owner }}/awx_devel:${GITHUB_REF##*/} || :
 
       - name: Build image
         run: |

From 7b35902d33fdd07c5ee16644b5da9a387a290192 Mon Sep 17 00:00:00 2001
From: Alan Rominger
Date: Tue, 12 Oct 2021 13:42:21 -0400
Subject: [PATCH 02/45] Respect settings to keep files and work units

Add new logic to clean up orphaned work units from administrative tasks.

Remove the noisy log about running cleanup on execution nodes; it is often
irrelevant, and we already have other logs for this.
---
 awx/main/models/ha.py | 4 +++-
 awx/main/tasks.py | 24 ++++++++++++++----------
 awx/main/utils/receptor.py | 38 ++++++++++++++++++++++++++++++++++----
 awx/settings/defaults.py | 4 +++-
 4 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py
index 1e9c6d983a..ea9f7d8d0e 100644
--- a/awx/main/models/ha.py
+++ b/awx/main/models/ha.py
@@ -162,7 +162,9 @@ class Instance(HasPolicyEditsMixin, BaseModel):
         returns a dict that is passed to
the python interface for the runner method corresponding to that command any kwargs will override that key=value combination in the returned dict """ - vargs = dict(file_pattern='/tmp/{}*'.format(JOB_FOLDER_PREFIX % '*')) + vargs = dict() + if settings.AWX_CLEANUP_PATHS: + vargs['file_pattern'] = '/tmp/{}*'.format(JOB_FOLDER_PREFIX % '*') vargs.update(kwargs) if 'exclude_strings' not in vargs and vargs.get('file_pattern'): active_pks = list(UnifiedJob.objects.filter(execution_node=self.hostname, status__in=('running', 'waiting')).values_list('pk', flat=True)) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 42fb01d253..8cf1e39969 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -108,7 +108,7 @@ from awx.main.utils.safe_yaml import safe_dump, sanitize_jinja from awx.main.utils.reload import stop_local_services from awx.main.utils.pglock import advisory_lock from awx.main.utils.handlers import SpecialInventoryHandler -from awx.main.utils.receptor import get_receptor_ctl, worker_info, get_conn_type, get_tls_client, worker_cleanup +from awx.main.utils.receptor import get_receptor_ctl, worker_info, get_conn_type, get_tls_client, worker_cleanup, administrative_workunit_reaper from awx.main.consumers import emit_channel_notification from awx.main import analytics from awx.conf import settings_registry @@ -397,20 +397,22 @@ def _cleanup_images_and_files(**kwargs): return this_inst = Instance.objects.me() runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs) - stdout = '' - with StringIO() as buffer: - with redirect_stdout(buffer): - ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs) - stdout = buffer.getvalue() - if '(changed: True)' in stdout: - logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}') + if runner_cleanup_kwargs: + stdout = '' + with StringIO() as buffer: + with redirect_stdout(buffer): + ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs) + stdout = buffer.getvalue() + if '(changed: True)' in stdout: + logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}') # if we are the first instance alphabetically, then run cleanup on execution nodes checker_instance = Instance.objects.filter(node_type__in=['hybrid', 'control'], enabled=True, capacity__gt=0).order_by('-hostname').first() if checker_instance and this_inst.hostname == checker_instance.hostname: - logger.info(f'Running execution node cleanup with kwargs {kwargs}') for inst in Instance.objects.filter(node_type='execution', enabled=True, capacity__gt=0): runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs) + if not runner_cleanup_kwargs: + continue try: stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs) if '(changed: True)' in stdout: @@ -649,6 +651,8 @@ def awx_receptor_workunit_reaper(): receptor_ctl.simple_command(f"work cancel {job.work_unit_id}") receptor_ctl.simple_command(f"work release {job.work_unit_id}") + administrative_workunit_reaper(receptor_work_list) + @task(queue=get_local_queuename) def awx_k8s_reaper(): @@ -3184,7 +3188,7 @@ class AWXReceptorJob: receptor_params["secret_kube_config"] = kubeconfig_yaml else: private_data_dir = self.runner_params['private_data_dir'] - if self.work_type == 'ansible-runner': + if self.work_type == 'ansible-runner' and settings.AWX_CLEANUP_PATHS: # on execution nodes, we rely on the private data dir being deleted cli_params = f"--private-data-dir={private_data_dir} --delete" else: diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py index 
b92b57c46a..cadf51a1b1 100644 --- a/awx/main/utils/receptor.py +++ b/awx/main/utils/receptor.py @@ -3,6 +3,7 @@ import yaml import time from receptorctl.socket_interface import ReceptorControl + from django.conf import settings from enum import Enum, unique @@ -11,6 +12,8 @@ logger = logging.getLogger('awx.main.utils.receptor') __RECEPTOR_CONF = '/etc/receptor/receptor.conf' +RECEPTOR_ACTIVE_STATES = ('Pending', 'Running') + @unique class ReceptorConnectionType(Enum): @@ -62,6 +65,32 @@ def get_conn_type(node_name, receptor_ctl): return ReceptorConnectionType(node.get('ConnType')) +def administrative_workunit_reaper(work_list=None): + """ + This releases completed work units that were spawned by actions inside of this module + specifically, this should catch any completed work unit left by + - worker_info + - worker_cleanup + These should ordinarily be released when the method finishes, but this is a + cleanup of last-resort, in case something went awry + """ + receptor_ctl = get_receptor_ctl() + if work_list is None: + work_list = receptor_ctl.simple_command("work list") + + for unit_id, work_data in work_list.items(): + extra_data = work_data.get('ExtraData') + if (extra_data is None) or (extra_data.get('RemoteWorkType') != 'ansible-runner'): + continue # if this is not ansible-runner work, we do not want to touch it + params = extra_data.get('RemoteParams', {}).get('params') + if not (params == '--worker-info' or params.startswith('cleanup')): + continue # if this is not a cleanup or health check, we do not want to touch it + if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES: + continue # do not want to touch active work units + logger.info(f'Reaping orphaned work unit {unit_id} with params {params}') + receptor_ctl.simple_command(f"work release {unit_id}") + + class RemoteJobError(RuntimeError): pass @@ -95,7 +124,7 @@ def run_until_complete(node, timing_data=None, **kwargs): while run_timing < 20.0: status = receptor_ctl.simple_command(f'work status {unit_id}') state_name = status.get('StateName') - if state_name not in ('Pending', 'Running'): + if state_name not in RECEPTOR_ACTIVE_STATES: break run_timing = time.time() - run_start time.sleep(0.5) @@ -110,9 +139,10 @@ def run_until_complete(node, timing_data=None, **kwargs): finally: - res = receptor_ctl.simple_command(f"work release {unit_id}") - if res != {'released': unit_id}: - logger.warn(f'Could not confirm release of receptor work unit id {unit_id} from {node}, data: {res}') + if settings.RECEPTOR_RELEASE_WORK: + res = receptor_ctl.simple_command(f"work release {unit_id}") + if res != {'released': unit_id}: + logger.warn(f'Could not confirm release of receptor work unit id {unit_id} from {node}, data: {res}') receptor_ctl.close() diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 3f949abeca..3fda6efff9 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -68,7 +68,6 @@ DATABASES = { # the K8S cluster where awx itself is running) IS_K8S = False -RECEPTOR_RELEASE_WORK = True AWX_CONTAINER_GROUP_K8S_API_TIMEOUT = 10 AWX_CONTAINER_GROUP_DEFAULT_NAMESPACE = os.getenv('MY_POD_NAMESPACE', 'default') # Timeout when waiting for pod to enter running state. If the pod is still in pending state , it will be terminated. Valid time units are "s", "m", "h". Example : "5m" , "10s". 
@@ -931,6 +930,9 @@ AWX_CALLBACK_PROFILE = False
 # Delete temporary directories created to store playbook run-time
 AWX_CLEANUP_PATHS = True
 
+# Delete completed work units in receptor
+RECEPTOR_RELEASE_WORK = True
+
 MIDDLEWARE = [
     'django_guid.middleware.GuidMiddleware',
     'awx.main.middleware.TimingMiddleware',

From f72292cce2f7d70019f3346b2670f84af665ca76 Mon Sep 17 00:00:00 2001
From: Bianca Henderson
Date: Thu, 14 Oct 2021 14:52:38 -0400
Subject: [PATCH 03/45] Move error handling into try/except block

---
 awx/main/tasks.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index 8cf1e39969..d51aac24cb 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -3122,9 +3122,14 @@ class AWXReceptorJob:
             resultsock.shutdown(socket.SHUT_RDWR)
             resultfile.close()
         elif res.status == 'error':
-            unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}')
-            detail = unit_status['Detail']
-            state_name = unit_status['StateName']
+            try:
+                unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}')
+                detail = unit_status.get('Detail', None)
+                state_name = unit_status.get('StateName', None)
+            except RuntimeError as e:
+                detail = ''
+                state_name = ''
+                logger.warn(e)
 
             if 'exceeded quota' in detail:
                 logger.warn(detail)

From 481047bed838228356057e5689f95729caafaecf Mon Sep 17 00:00:00 2001
From: Bianca Henderson
Date: Fri, 15 Oct 2021 08:08:24 -0400
Subject: [PATCH 04/45] Change log level from 'warning' to 'exception'

---
 awx/main/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index d51aac24cb..a298b08eea 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -3129,7 +3129,7 @@ class AWXReceptorJob:
             except RuntimeError as e:
                 detail = ''
                 state_name = ''
-                logger.warn(e)
+                logger.exception(e)
 
             if 'exceeded quota' in detail:
                 logger.warn(detail)

From 3065e29deb998959748a63f8248182f289c55b65 Mon Sep 17 00:00:00 2001
From: chris meyers
Date: Fri, 15 Oct 2021 16:27:24 -0400
Subject: [PATCH 05/45] avoid work_results and work release race

* Unsure exactly why this happens, but there seems to be a race condition
  related to the time window between Receptor work_results and work release.
  This sleep extends that window and hopefully avoids the race condition.
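  The window in question sits between the last read from the work unit's
  results stream and the "work release" command that deletes the unit's
  on-disk state. A minimal sketch of the mitigation, assuming the
  receptorctl simple_command interface used elsewhere in this series;
  finish_work_unit is a hypothetical stand-in for the result handling in
  AWXReceptorJob, not code from this patch:

      import time

      def finish_work_unit(receptor_ctl, unit_id, resultfile):
          # Drain everything the work unit wrote before receptor is
          # asked to delete the unit's on-disk state.
          output = resultfile.read()
          # The patch's mitigation: pause before the release so an
          # in-flight work_results stream is not torn down mid-read.
          time.sleep(3)
          receptor_ctl.simple_command(f"work release {unit_id}")
          return output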
---
 awx/main/tasks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index a298b08eea..9ff1120ea5 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -3151,6 +3151,7 @@ class AWXReceptorJob:
         except Exception:
             raise RuntimeError(detail)
 
+        time.sleep(3)
         return res
 
     # Spawned in a thread so Receptor can start reading before we finish writing, we

From d6b4b9f9736facf8497bbc84d33c817b8808c7ef Mon Sep 17 00:00:00 2001
From: Marcelo Moreira de Mello
Date: Wed, 13 Oct 2021 11:36:15 -0400
Subject: [PATCH 06/45] Added node_type to awx-manage list_instances command

(cherry picked from commit 683145e3eaa8b13da59bc51e57dff98f25d3554d)
---
 awx/main/management/commands/list_instances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/awx/main/management/commands/list_instances.py b/awx/main/management/commands/list_instances.py
index 7568f0b45c..bef9034774 100644
--- a/awx/main/management/commands/list_instances.py
+++ b/awx/main/management/commands/list_instances.py
@@ -47,7 +47,7 @@ class Command(BaseCommand):
             color = '\033[90m[DISABLED] '
         if no_color:
             color = ''
-        fmt = '\t' + color + '{0.hostname} capacity={0.capacity} version={1}'
+        fmt = '\t' + color + '{0.hostname} capacity={0.capacity} node_type={0.node_type} version={1}'
         if x.capacity:
             fmt += ' heartbeat="{0.modified:%Y-%m-%d %H:%M:%S}"'
         print((fmt + '\033[0m').format(x, x.version or '?'))

From 206c85778ebcfcf9f424c2f8e389ec8817bdc61e Mon Sep 17 00:00:00 2001
From: nixocio
Date: Fri, 15 Oct 2021 09:52:13 -0400
Subject: [PATCH 07/45] Do not show control instances as an option to be associated

Do not show control instances as an option to be associated with
user-defined instance groups.

See: https://github.com/ansible/tower/issues/5339
---
 awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js b/awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js
index f87c6cae62..f0a87fe653 100644
--- a/awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js
+++ b/awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js
@@ -147,7 +147,10 @@ function InstanceList() {
   const fetchInstancesToAssociate = useCallback(
     (params) =>
       InstancesAPI.read(
-        mergeParams(params, { not__rampart_groups__id: instanceGroupId })
+        mergeParams(params, {
+          ...{ not__rampart_groups__id: instanceGroupId },
+          ...{ not__node_type: 'control' },
+        })
       ),
     [instanceGroupId]
   );

From f34c96ecf524d8e77caf2ea9272123dbac4359ec Mon Sep 17 00:00:00 2001
From: Alan Rominger
Date: Fri, 15 Oct 2021 11:23:38 -0400
Subject: [PATCH 08/45] Error handling when a node is missing from the mesh for jobs and checks

---
 awx/main/exceptions.py | 4 ++++
 awx/main/tasks.py | 6 ++++--
 awx/main/utils/receptor.py | 8 +++++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/awx/main/exceptions.py b/awx/main/exceptions.py
index 6a9bb7ece4..2cd9a44418 100644
--- a/awx/main/exceptions.py
+++ b/awx/main/exceptions.py
@@ -36,3 +36,7 @@ class PostRunError(Exception):
         self.status = status
         self.tb = tb
         super(PostRunError, self).__init__(msg)
+
+
+class ReceptorNodeNotFound(RuntimeError):
+    pass

diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index 9ff1120ea5..94d348400e 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -85,7 +85,7 @@ from awx.main.models import (
     build_safe_env,
 )
 from awx.main.constants import ACTIVE_STATES
-from awx.main.exceptions
import AwxTaskError, PostRunError, ReceptorNodeNotFound from awx.main.queue import CallbackQueueDispatcher from awx.main.dispatch.publish import task from awx.main.dispatch import get_local_queuename, reaper @@ -1546,6 +1546,8 @@ class BaseTask(object): # ensure failure notification sends even if playbook_on_stats event is not triggered handle_success_and_failure_notifications.apply_async([self.instance.job.id]) + except ReceptorNodeNotFound as exc: + extra_update_fields['job_explanation'] = str(exc) except Exception: # this could catch programming or file system errors extra_update_fields['result_traceback'] = traceback.format_exc() @@ -3069,7 +3071,7 @@ class AWXReceptorJob: receptor_ctl.simple_command(f"work release {self.unit_id}") # If an error occured without the job itself failing, it could be a broken instance if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)): - execution_node_health_check(self.task.instance.execution_node) + execution_node_health_check.delay(self.task.instance.execution_node) @property def sign_work(self): diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py index cadf51a1b1..edc3887587 100644 --- a/awx/main/utils/receptor.py +++ b/awx/main/utils/receptor.py @@ -1,12 +1,14 @@ import logging import yaml import time +from enum import Enum, unique from receptorctl.socket_interface import ReceptorControl +from awx.main.exceptions import ReceptorNodeNotFound + from django.conf import settings -from enum import Enum, unique logger = logging.getLogger('awx.main.utils.receptor') @@ -63,6 +65,7 @@ def get_conn_type(node_name, receptor_ctl): for node in all_nodes: if node.get('NodeID') == node_name: return ReceptorConnectionType(node.get('ConnType')) + raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh') def administrative_workunit_reaper(work_list=None): @@ -183,6 +186,9 @@ def worker_info(node_name, work_type='ansible-runner'): else: error_list.append(details) + except ReceptorNodeNotFound as exc: + error_list.append(str(exc)) + # If we have a connection error, missing keys would be trivial consequence of that if not data['errors']: # see tasks.py usage of keys From 0d3a22bbc3308d8d588501db85f46668e566b942 Mon Sep 17 00:00:00 2001 From: Alex Corey Date: Tue, 19 Oct 2021 09:20:41 -0400 Subject: [PATCH 09/45] Fixes erroneous validation --- awx/ui/src/components/LaunchPrompt/steps/useSurveyStep.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/awx/ui/src/components/LaunchPrompt/steps/useSurveyStep.js b/awx/ui/src/components/LaunchPrompt/steps/useSurveyStep.js index 19484747fa..a19bc46a57 100644 --- a/awx/ui/src/components/LaunchPrompt/steps/useSurveyStep.js +++ b/awx/ui/src/components/LaunchPrompt/steps/useSurveyStep.js @@ -128,9 +128,9 @@ function checkForError(launchConfig, surveyConfig, values) { hasError = true; } } - if (isNumeric && (value || value === 0)) { + if (isNumeric) { if ( - (value < question.min || value > question.max) && + (value < question.min || value > question.max || value === '') && question.required ) { hasError = true; From 6f20a798abef14104b0d0c36a74e1f8b30c97afa Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 13 Oct 2021 16:36:14 -0400 Subject: [PATCH 10/45] Allow testing a single hybrid instance like the good old days --- tools/docker-compose/ansible/roles/sources/tasks/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/docker-compose/ansible/roles/sources/tasks/main.yml 
b/tools/docker-compose/ansible/roles/sources/tasks/main.yml index b6df968ed7..7b442aa3bc 100644 --- a/tools/docker-compose/ansible/roles/sources/tasks/main.yml +++ b/tools/docker-compose/ansible/roles/sources/tasks/main.yml @@ -113,4 +113,5 @@ src: "receptor-worker.conf.j2" dest: "{{ sources_dest }}/receptor/receptor-worker-{{ item }}.conf" mode: '0600' - with_sequence: start=1 end={{ execution_node_count }} + with_sequence: start=1 end={{ execution_node_count if execution_node_count | int > 0 else 1}} + when: execution_node_count | int > 0 From 77076dbd6762caa7df19b7331c354270e47306ac Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Tue, 19 Oct 2021 14:44:03 -0400 Subject: [PATCH 11/45] Reduce the number of triggers for execution node health checks --- awx/main/tasks.py | 5 +---- awx/settings/defaults.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 94d348400e..0ff7820ea5 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -534,7 +534,7 @@ def inspect_execution_nodes(instance_list): # check logger.warn(f'Execution node attempting to rejoin as instance {hostname}.') execution_node_health_check.apply_async([hostname]) - elif instance.capacity == 0: + elif instance.capacity == 0 and instance.enabled: # nodes with proven connection but need remediation run health checks are reduced frequency if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS: # Periodically re-run the health check of errored nodes, in case someone fixed it @@ -3069,9 +3069,6 @@ class AWXReceptorJob: # Make sure to always release the work unit if we established it if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: receptor_ctl.simple_command(f"work release {self.unit_id}") - # If an error occured without the job itself failing, it could be a broken instance - if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)): - execution_node_health_check.delay(self.task.instance.execution_node) @property def sign_work(self): diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 3fda6efff9..0b5bd0b4b6 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -425,7 +425,7 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199') # heartbeat period can factor into some forms of logic, so it is maintained as a setting here CLUSTER_NODE_HEARTBEAT_PERIOD = 60 RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34 -EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 10 # once every 10 minutes check if an execution node errors have been resolved +EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes check if an execution node errors have been resolved BROKER_URL = 'unix:///var/run/redis/redis.sock' CELERYBEAT_SCHEDULE = { From e54db3ce50733b031f1e9de638245fabdd0feb23 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Fri, 15 Oct 2021 15:03:59 -0400 Subject: [PATCH 12/45] Gracefully handle receptorctl RuntimeError in health check --- awx/main/tests/functional/api/test_instance.py | 14 ++++++++++++++ awx/main/utils/receptor.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/awx/main/tests/functional/api/test_instance.py b/awx/main/tests/functional/api/test_instance.py index b94b860b01..b840ba7f29 100644 --- a/awx/main/tests/functional/api/test_instance.py 
+++ b/awx/main/tests/functional/api/test_instance.py @@ -50,6 +50,20 @@ def test_auditor_user_health_check(get, post, system_auditor): post(url=url, user=system_auditor, expect=403) +@pytest.mark.django_db +def test_health_check_throws_error(post, admin_user): + instance = Instance.objects.create(node_type='execution', **INSTANCE_KWARGS) + url = reverse('api:instance_health_check', kwargs={'pk': instance.pk}) + # we will simulate a receptor error, similar to this one + # https://github.com/ansible/receptor/blob/156e6e24a49fbf868734507f9943ac96208ed8f5/receptorctl/receptorctl/socket_interface.py#L204 + # related to issue https://github.com/ansible/tower/issues/5315 + with mock.patch('awx.main.utils.receptor.run_until_complete', side_effect=RuntimeError('Remote error: foobar')): + post(url=url, user=admin_user, expect=200) + instance.refresh_from_db() + assert 'Remote error: foobar' in instance.errors + assert instance.capacity == 0 + + @pytest.mark.django_db @mock.patch.object(redis.client.Redis, 'ping', lambda self: True) def test_health_check_usage(get, post, admin_user): diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py index edc3887587..19d76afd90 100644 --- a/awx/main/utils/receptor.py +++ b/awx/main/utils/receptor.py @@ -186,7 +186,7 @@ def worker_info(node_name, work_type='ansible-runner'): else: error_list.append(details) - except ReceptorNodeNotFound as exc: + except (ReceptorNodeNotFound, RuntimeError) as exc: error_list.append(str(exc)) # If we have a connection error, missing keys would be trivial consequence of that From e04efad3c0485818a8330eefc9a9a8f819886e60 Mon Sep 17 00:00:00 2001 From: Seth Foster Date: Tue, 19 Oct 2021 17:11:09 -0400 Subject: [PATCH 13/45] tools_receptor_1 should use whatever awx_devel tag that tools_awx_1 is using --- .../ansible/roles/sources/templates/docker-compose.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index d623c82ef1..d89a733ed1 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -111,7 +111,7 @@ services: - "../../docker-compose/_sources/receptor/receptor-hop.conf:/etc/receptor/receptor.conf" {% for i in range(execution_node_count|int) -%} receptor-{{ loop.index }}: - image: quay.io/awx/awx_devel:devel + image: "{{ awx_image }}:{{ awx_image_tag }}" user: "{{ ansible_user_uid }}" container_name: tools_receptor_{{ loop.index }} hostname: receptor-{{ loop.index }} From 1e5231d68ba5999032c7094bb81f01b8db7cf860 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Tue, 19 Oct 2021 14:37:48 -0400 Subject: [PATCH 14/45] Enable ActivityStream capture for Instances --- awx/api/serializers.py | 1 + awx/main/models/__init__.py | 1 + awx/main/signals.py | 1 + awx/main/tasks.py | 3 +++ 4 files changed, 6 insertions(+) diff --git a/awx/api/serializers.py b/awx/api/serializers.py index c3e2a427f9..e8df13ab3e 100644 --- a/awx/api/serializers.py +++ b/awx/api/serializers.py @@ -5003,6 +5003,7 @@ class ActivityStreamSerializer(BaseSerializer): ('credential_type', ('id', 'name', 'description', 'kind', 'managed')), ('ad_hoc_command', ('id', 'name', 'status', 'limit')), ('workflow_approval', ('id', 'name', 'unified_job_id')), + ('instance', ('id', 'hostname')), ] return field_list diff --git a/awx/main/models/__init__.py b/awx/main/models/__init__.py 
index 0fab2cd4f6..29b64a0758 100644 --- a/awx/main/models/__init__.py +++ b/awx/main/models/__init__.py @@ -201,6 +201,7 @@ activity_stream_registrar.connect(Organization) activity_stream_registrar.connect(Inventory) activity_stream_registrar.connect(Host) activity_stream_registrar.connect(Group) +activity_stream_registrar.connect(Instance) activity_stream_registrar.connect(InventorySource) # activity_stream_registrar.connect(InventoryUpdate) activity_stream_registrar.connect(Credential) diff --git a/awx/main/signals.py b/awx/main/signals.py index 5caf7b45a8..d8931dc266 100644 --- a/awx/main/signals.py +++ b/awx/main/signals.py @@ -377,6 +377,7 @@ def model_serializer_mapping(): models.Inventory: serializers.InventorySerializer, models.Host: serializers.HostSerializer, models.Group: serializers.GroupSerializer, + models.Instance: serializers.InstanceSerializer, models.InstanceGroup: serializers.InstanceGroupSerializer, models.InventorySource: serializers.InventorySourceSerializer, models.Credential: serializers.CredentialSerializer, diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 0ff7820ea5..9e933f17ba 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -288,6 +288,9 @@ def apply_cluster_membership_policies(): continue instances_to_add = set(g.instances) - set(g.prior_instances) instances_to_remove = set(g.prior_instances) - set(g.instances) + # The following writes to the db don't spam the activity stream, because + # InstanceGroup is special-cased in signals.py to connect to only the non-m2m + # signal handlers. if instances_to_add: logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name)) g.obj.instances.add(*instances_to_add) From 62d50d27bec5095bf40e19ae59f24b0477c45457 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Tue, 19 Oct 2021 16:56:46 -0400 Subject: [PATCH 15/45] Update a couple of the existing tests --- awx/main/tests/functional/api/test_instance.py | 5 +++++ .../functional/models/test_activity_stream.py | 2 +- awx/main/tests/functional/test_instances.py | 14 +++++++++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/awx/main/tests/functional/api/test_instance.py b/awx/main/tests/functional/api/test_instance.py index b840ba7f29..c65cea0c01 100644 --- a/awx/main/tests/functional/api/test_instance.py +++ b/awx/main/tests/functional/api/test_instance.py @@ -3,6 +3,7 @@ import pytest from unittest import mock from awx.api.versioning import reverse +from awx.main.models.activity_stream import ActivityStream from awx.main.models.ha import Instance import redis @@ -17,6 +18,7 @@ INSTANCE_KWARGS = dict(hostname='example-host', cpu=6, memory=36000000000, cpu_c @pytest.mark.django_db def test_disabled_zeros_capacity(patch, admin_user): instance = Instance.objects.create(**INSTANCE_KWARGS) + assert ActivityStream.objects.filter(instance=instance).count() == 1 url = reverse('api:instance_detail', kwargs={'pk': instance.pk}) @@ -25,12 +27,14 @@ def test_disabled_zeros_capacity(patch, admin_user): instance.refresh_from_db() assert instance.capacity == 0 + assert ActivityStream.objects.filter(instance=instance).count() == 2 @pytest.mark.django_db def test_enabled_sets_capacity(patch, admin_user): instance = Instance.objects.create(enabled=False, capacity=0, **INSTANCE_KWARGS) assert instance.capacity == 0 + assert ActivityStream.objects.filter(instance=instance).count() == 1 url = reverse('api:instance_detail', kwargs={'pk': instance.pk}) @@ -39,6 +43,7 @@ def test_enabled_sets_capacity(patch, admin_user): 
instance.refresh_from_db() assert instance.capacity > 0 + assert ActivityStream.objects.filter(instance=instance).count() == 2 @pytest.mark.django_db diff --git a/awx/main/tests/functional/models/test_activity_stream.py b/awx/main/tests/functional/models/test_activity_stream.py index bc6c3e8c51..f8ae40b540 100644 --- a/awx/main/tests/functional/models/test_activity_stream.py +++ b/awx/main/tests/functional/models/test_activity_stream.py @@ -170,7 +170,7 @@ def test_activity_stream_actor(admin_user): @pytest.mark.django_db -def test_annon_user_action(): +def test_anon_user_action(): with mock.patch('awx.main.signals.get_current_user') as u_mock: u_mock.return_value = AnonymousUser() inv = Inventory.objects.create(name='ainventory') diff --git a/awx/main/tests/functional/test_instances.py b/awx/main/tests/functional/test_instances.py index a0a06b4ae5..21a17ff2b5 100644 --- a/awx/main/tests/functional/test_instances.py +++ b/awx/main/tests/functional/test_instances.py @@ -2,6 +2,7 @@ import pytest from unittest import mock from awx.main.models import AdHocCommand, InventoryUpdate, JobTemplate, ProjectUpdate +from awx.main.models.activity_stream import ActivityStream from awx.main.models.ha import Instance, InstanceGroup from awx.main.tasks import apply_cluster_membership_policies from awx.api.versioning import reverse @@ -72,6 +73,7 @@ def test_instance_dup(org_admin, organization, project, instance_factory, instan i1 = instance_factory("i1") i2 = instance_factory("i2") i3 = instance_factory("i3") + ig_all = instance_group_factory("all", instances=[i1, i2, i3]) ig_dup = instance_group_factory("duplicates", instances=[i1]) project.organization.instance_groups.add(ig_all, ig_dup) @@ -83,7 +85,7 @@ def test_instance_dup(org_admin, organization, project, instance_factory, instan api_num_instances_oa = list(list_response2.data.items())[0][1] assert actual_num_instances == api_num_instances_auditor - # Note: The org_admin will not see the default 'tower' node (instance fixture) because it is not in it's group, as expected + # Note: The org_admin will not see the default 'tower' node (instance fixture) because it is not in its group, as expected assert api_num_instances_oa == (actual_num_instances - 1) @@ -94,7 +96,13 @@ def test_policy_instance_few_instances(instance_factory, instance_group_factory) ig_2 = instance_group_factory("ig2", percentage=25) ig_3 = instance_group_factory("ig3", percentage=25) ig_4 = instance_group_factory("ig4", percentage=25) + + count = ActivityStream.objects.count() + apply_cluster_membership_policies() + # running apply_cluster_membership_policies shouldn't spam the activity stream + assert ActivityStream.objects.count() == count + assert len(ig_1.instances.all()) == 1 assert i1 in ig_1.instances.all() assert len(ig_2.instances.all()) == 1 @@ -103,8 +111,12 @@ def test_policy_instance_few_instances(instance_factory, instance_group_factory) assert i1 in ig_3.instances.all() assert len(ig_4.instances.all()) == 1 assert i1 in ig_4.instances.all() + i2 = instance_factory("i2") + count += 1 apply_cluster_membership_policies() + assert ActivityStream.objects.count() == count + assert len(ig_1.instances.all()) == 1 assert i1 in ig_1.instances.all() assert len(ig_2.instances.all()) == 1 From 7010015e8a4ae184f106e799d501f7d6fdd59f45 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Thu, 21 Oct 2021 10:08:42 -0400 Subject: [PATCH 16/45] Change the ActivityStream registration for InstanceGroups to include the m2m fields. 
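Connecting InstanceGroup through the registrar captures membership
changes (instances added to or removed from a group), not just
create/update/delete. A condensed before/after sketch using the lines
shown in this patch; that the registrar also wires up m2m_changed
receivers is an assumption about ActivityStreamRegistrar, not something
this diff itself states:

    # Before: save/delete signals were connected by hand, so m2m changes
    # on fields like InstanceGroup.instances were never recorded.
    post_save.connect(activity_stream_create, sender=InstanceGroup, dispatch_uid=str(InstanceGroup) + "_create")

    # After: a single registrar call connects the model's full handler
    # set, assumed to include receivers for its m2m fields.
    activity_stream_registrar.connect(InstanceGroup)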
Also to avoid spamminess, disable the activity stream on the apply_cluster_membership_policies task. --- awx/main/models/__init__.py | 1 + awx/main/signals.py | 6 ------ awx/main/tasks.py | 30 +++++++++++++++--------------- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/awx/main/models/__init__.py b/awx/main/models/__init__.py index 29b64a0758..f439a692fb 100644 --- a/awx/main/models/__init__.py +++ b/awx/main/models/__init__.py @@ -202,6 +202,7 @@ activity_stream_registrar.connect(Inventory) activity_stream_registrar.connect(Host) activity_stream_registrar.connect(Group) activity_stream_registrar.connect(Instance) +activity_stream_registrar.connect(InstanceGroup) activity_stream_registrar.connect(InventorySource) # activity_stream_registrar.connect(InventoryUpdate) activity_stream_registrar.connect(Credential) diff --git a/awx/main/signals.py b/awx/main/signals.py index d8931dc266..3cc57abb6a 100644 --- a/awx/main/signals.py +++ b/awx/main/signals.py @@ -676,9 +676,3 @@ def create_access_token_user_if_missing(sender, **kwargs): post_save.disconnect(create_access_token_user_if_missing, sender=OAuth2AccessToken) obj.save() post_save.connect(create_access_token_user_if_missing, sender=OAuth2AccessToken) - - -# Connect the Instance Group to Activity Stream receivers. -post_save.connect(activity_stream_create, sender=InstanceGroup, dispatch_uid=str(InstanceGroup) + "_create") -pre_save.connect(activity_stream_update, sender=InstanceGroup, dispatch_uid=str(InstanceGroup) + "_update") -pre_delete.connect(activity_stream_delete, sender=InstanceGroup, dispatch_uid=str(InstanceGroup) + "_delete") diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 9e933f17ba..90b5672241 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -191,6 +191,8 @@ def inform_cluster_of_shutdown(): @task(queue=get_local_queuename) def apply_cluster_membership_policies(): + from awx.main.signals import disable_activity_stream + started_waiting = time.time() with advisory_lock('cluster_policy_lock', wait=True): lock_time = time.time() - started_waiting @@ -282,21 +284,19 @@ def apply_cluster_membership_policies(): # On a differential basis, apply instances to groups with transaction.atomic(): - for g in actual_groups: - if g.obj.is_container_group: - logger.debug('Skipping containerized group {} for policy calculation'.format(g.obj.name)) - continue - instances_to_add = set(g.instances) - set(g.prior_instances) - instances_to_remove = set(g.prior_instances) - set(g.instances) - # The following writes to the db don't spam the activity stream, because - # InstanceGroup is special-cased in signals.py to connect to only the non-m2m - # signal handlers. 
- if instances_to_add: - logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name)) - g.obj.instances.add(*instances_to_add) - if instances_to_remove: - logger.debug('Removing instances {} from group {}'.format(list(instances_to_remove), g.obj.name)) - g.obj.instances.remove(*instances_to_remove) + with disable_activity_stream(): + for g in actual_groups: + if g.obj.is_container_group: + logger.debug('Skipping containerized group {} for policy calculation'.format(g.obj.name)) + continue + instances_to_add = set(g.instances) - set(g.prior_instances) + instances_to_remove = set(g.prior_instances) - set(g.instances) + if instances_to_add: + logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name)) + g.obj.instances.add(*instances_to_add) + if instances_to_remove: + logger.debug('Removing instances {} from group {}'.format(list(instances_to_remove), g.obj.name)) + g.obj.instances.remove(*instances_to_remove) logger.debug('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute)) From 056247a34a047e06cc2579f7f1f89f12c406931b Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Thu, 21 Oct 2021 10:51:44 -0400 Subject: [PATCH 17/45] Adjust Instance-InstanceGroup tests to show that the ActivityStream is captured --- awx/main/signals.py | 1 - .../functional/api/test_instance_group.py | 57 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/awx/main/signals.py b/awx/main/signals.py index 3cc57abb6a..8dde65342d 100644 --- a/awx/main/signals.py +++ b/awx/main/signals.py @@ -34,7 +34,6 @@ from awx.main.models import ( ExecutionEnvironment, Group, Host, - InstanceGroup, Inventory, InventorySource, Job, diff --git a/awx/main/tests/functional/api/test_instance_group.py b/awx/main/tests/functional/api/test_instance_group.py index 5a787b6607..97b8abcff2 100644 --- a/awx/main/tests/functional/api/test_instance_group.py +++ b/awx/main/tests/functional/api/test_instance_group.py @@ -4,6 +4,7 @@ import pytest from awx.api.versioning import reverse from awx.main.models import ( + ActivityStream, Instance, InstanceGroup, ProjectUpdate, @@ -213,9 +214,23 @@ def test_containerized_group_default_fields(instance_group, kube_credential): def test_instance_attach_to_instance_group(post, instance_group, node_type_instance, admin, node_type): instance = node_type_instance(hostname=node_type, node_type=node_type) + count = ActivityStream.objects.count() + url = reverse(f'api:instance_group_instance_list', kwargs={'pk': instance_group.pk}) post(url, {'associate': True, 'id': instance.id}, admin, expect=204 if node_type != 'control' else 400) + new_activity = ActivityStream.objects.all()[count:] + if node_type != 'control': + assert len(new_activity) == 2 # the second is an update of the instance group policy + new_activity = new_activity[0] + assert new_activity.operation == 'associate' + assert new_activity.object1 == 'instance_group' + assert new_activity.object2 == 'instance' + assert new_activity.instance.first() == instance + assert new_activity.instance_group.first() == instance_group + else: + assert not new_activity + @pytest.mark.django_db @pytest.mark.parametrize('node_type', ['control', 'hybrid', 'execution']) @@ -223,18 +238,46 @@ def test_instance_unattach_from_instance_group(post, instance_group, node_type_i instance = node_type_instance(hostname=node_type, node_type=node_type) instance_group.instances.add(instance) + count = ActivityStream.objects.count() + url = 
reverse(f'api:instance_group_instance_list', kwargs={'pk': instance_group.pk}) post(url, {'disassociate': True, 'id': instance.id}, admin, expect=204 if node_type != 'control' else 400) + new_activity = ActivityStream.objects.all()[count:] + if node_type != 'control': + assert len(new_activity) == 1 + new_activity = new_activity[0] + assert new_activity.operation == 'disassociate' + assert new_activity.object1 == 'instance_group' + assert new_activity.object2 == 'instance' + assert new_activity.instance.first() == instance + assert new_activity.instance_group.first() == instance_group + else: + assert not new_activity + @pytest.mark.django_db @pytest.mark.parametrize('node_type', ['control', 'hybrid', 'execution']) def test_instance_group_attach_to_instance(post, instance_group, node_type_instance, admin, node_type): instance = node_type_instance(hostname=node_type, node_type=node_type) + count = ActivityStream.objects.count() + url = reverse(f'api:instance_instance_groups_list', kwargs={'pk': instance.pk}) post(url, {'associate': True, 'id': instance_group.id}, admin, expect=204 if node_type != 'control' else 400) + new_activity = ActivityStream.objects.all()[count:] + if node_type != 'control': + assert len(new_activity) == 2 # the second is an update of the instance group policy + new_activity = new_activity[0] + assert new_activity.operation == 'associate' + assert new_activity.object1 == 'instance' + assert new_activity.object2 == 'instance_group' + assert new_activity.instance.first() == instance + assert new_activity.instance_group.first() == instance_group + else: + assert not new_activity + @pytest.mark.django_db @pytest.mark.parametrize('node_type', ['control', 'hybrid', 'execution']) @@ -242,5 +285,19 @@ def test_instance_group_unattach_from_instance(post, instance_group, node_type_i instance = node_type_instance(hostname=node_type, node_type=node_type) instance_group.instances.add(instance) + count = ActivityStream.objects.count() + url = reverse(f'api:instance_instance_groups_list', kwargs={'pk': instance.pk}) post(url, {'disassociate': True, 'id': instance_group.id}, admin, expect=204 if node_type != 'control' else 400) + + new_activity = ActivityStream.objects.all()[count:] + if node_type != 'control': + assert len(new_activity) == 1 + new_activity = new_activity[0] + assert new_activity.operation == 'disassociate' + assert new_activity.object1 == 'instance' + assert new_activity.object2 == 'instance_group' + assert new_activity.instance.first() == instance + assert new_activity.instance_group.first() == instance_group + else: + assert not new_activity From d1fecc11c95a9482838ab1a158ec2d062d7512d3 Mon Sep 17 00:00:00 2001 From: Jim Ladd Date: Fri, 15 Oct 2021 16:45:21 -0700 Subject: [PATCH 18/45] when releasing receptor work, do so in try/except --- awx/main/tasks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 90b5672241..e5a30098a6 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3071,7 +3071,13 @@ class AWXReceptorJob: finally: # Make sure to always release the work unit if we established it if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: - receptor_ctl.simple_command(f"work release {self.unit_id}") + try: + receptor_ctl.simple_command(f"work release {self.unit_id}") + except RuntimeError: + logger.exception(f"Unable to release work item {self.unit_id}") + # If an error occured without the job itself failing, it could be a broken instance + if self.work_type == 
'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)): + execution_node_health_check.delay(self.task.instance.execution_node) @property def sign_work(self): From ebb45815951386922c86678f90a5ad5b0ccdaa8a Mon Sep 17 00:00:00 2001 From: Jim Ladd Date: Fri, 15 Oct 2021 16:46:15 -0700 Subject: [PATCH 19/45] update exception log message to be descriptive .. instead of surfacing exception --- awx/main/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index e5a30098a6..b06deaabf7 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3137,7 +3137,7 @@ class AWXReceptorJob: except RuntimeError as e: detail = '' state_name = '' - logger.exception(e) + logger.exception(f'Unable to retrieve work status for {self.unit_id}') if 'exceeded quota' in detail: logger.warn(detail) From 26055de7723051142a64241e3b4a13aba77ffd0e Mon Sep 17 00:00:00 2001 From: Jim Ladd Date: Fri, 15 Oct 2021 16:49:14 -0700 Subject: [PATCH 20/45] cancel job if receptor no longer knows about the work item --- awx/main/tasks.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index b06deaabf7..327a1db1a8 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3134,7 +3134,7 @@ class AWXReceptorJob: unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}') detail = unit_status.get('Detail', None) state_name = unit_status.get('StateName', None) - except RuntimeError as e: + except RuntimeError: detail = '' state_name = '' logger.exception(f'Unable to retrieve work status for {self.unit_id}') @@ -3145,6 +3145,11 @@ class AWXReceptorJob: logger.warn(f"Could not launch pod for {log_name}. Exceeded quota.") self.task.update_model(self.task.instance.pk, status='pending') return + + # if we did not exceed the quota, continue with shutting down the job + resultsock.shutdown(socket.SHUT_RDWR) + resultfile.close() + # If ansible-runner ran, but an error occured at runtime, the traceback information # is saved via the status_handler passed in to the processor. 
if state_name == 'Succeeded': @@ -3224,10 +3229,21 @@ class AWXReceptorJob: @cleanup_new_process def cancel_watcher(self, processor_future): + receptor_ctl = get_receptor_ctl() while True: if processor_future.done(): return processor_future.result() + # cancel job if receptor no longer knows about work item + try: + receptor_ctl.simple_command(f'work status {self.unit_id}') + except RuntimeError: + self.task.instance.result_traceback = traceback.format_exc() + self.task.instance.save(update_fields=['result_traceback']) + + result = namedtuple('result', ['status', 'rc']) + return result('error', 1) + if self.task.cancel_callback(): result = namedtuple('result', ['status', 'rc']) return result('canceled', 1) From eb6c58682d1fb8bf05224e6309bb0f2378937568 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 18 Oct 2021 15:26:07 -0400 Subject: [PATCH 21/45] Alternative for reaping lost jobs, in work unit reaper --- awx/main/tasks.py | 55 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 327a1db1a8..fd032653e2 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -108,7 +108,15 @@ from awx.main.utils.safe_yaml import safe_dump, sanitize_jinja from awx.main.utils.reload import stop_local_services from awx.main.utils.pglock import advisory_lock from awx.main.utils.handlers import SpecialInventoryHandler -from awx.main.utils.receptor import get_receptor_ctl, worker_info, get_conn_type, get_tls_client, worker_cleanup, administrative_workunit_reaper +from awx.main.utils.receptor import ( + get_receptor_ctl, + worker_info, + get_conn_type, + get_tls_client, + worker_cleanup, + administrative_workunit_reaper, + RECEPTOR_ACTIVE_STATES, +) from awx.main.consumers import emit_channel_notification from awx.main import analytics from awx.conf import settings_registry @@ -651,8 +659,31 @@ def awx_receptor_workunit_reaper(): jobs_with_unreleased_receptor_units = UnifiedJob.objects.filter(work_unit_id__in=unit_ids).exclude(status__in=ACTIVE_STATES) for job in jobs_with_unreleased_receptor_units: logger.debug(f"{job.log_format} is not active, reaping receptor work unit {job.work_unit_id}") - receptor_ctl.simple_command(f"work cancel {job.work_unit_id}") - receptor_ctl.simple_command(f"work release {job.work_unit_id}") + try: + receptor_ctl.simple_command(f"work cancel {job.work_unit_id}") + receptor_ctl.simple_command(f"work release {job.work_unit_id}") + except RuntimeError as exc: + if 'unknown work unit' not in str(exc): + logger.exception(f'Unexpected error releasing work unit {job.work_unit_id}') + + active_unit_ids = [id for (id, data) in receptor_work_list.items() if data.get('StateName') in RECEPTOR_ACTIVE_STATES] + + jobs_without_active_work_unit = UnifiedJob.objects.filter(status__in=ACTIVE_STATES, controller_node=settings.CLUSTER_HOST_ID).exclude( + work_unit_id__in=active_unit_ids + ) + + for job in jobs_without_active_work_unit: + # TODO: skip if events came in recently + job.cancel_flag = True + job.status = 'error' + if job.work_unit_id in receptor_work_list: + receptor_status = receptor_work_list[job.work_unit_id]['StateName'] + logger.warn(f'Canceling {job.log_format} because of inactive work unit, data:\n{receptor_work_list[job.work_unit_id]}') + job.job_explanation = f'Canceled by reaper because receptor work unit reported a status of {receptor_status}' + else: + logger.warn(f'Canceling {job.log_format} because of missing work unit {job.work_unit_id}') + job.job_explanation = 
f'Canceled by reaper because receptor work unit could not be found for this job' + job.save(update_fields=['cancel_flag', 'status', 'job_explanation']) administrative_workunit_reaper(receptor_work_list) @@ -3074,7 +3105,7 @@ class AWXReceptorJob: try: receptor_ctl.simple_command(f"work release {self.unit_id}") except RuntimeError: - logger.exception(f"Unable to release work item {self.unit_id}") + logger.exception(f"Unable to release work item {self.unit_id} from {self.task.instance.log_format}") # If an error occured without the job itself failing, it could be a broken instance if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)): execution_node_health_check.delay(self.task.instance.execution_node) @@ -3229,24 +3260,16 @@ class AWXReceptorJob: @cleanup_new_process def cancel_watcher(self, processor_future): - receptor_ctl = get_receptor_ctl() while True: if processor_future.done(): return processor_future.result() - # cancel job if receptor no longer knows about work item - try: - receptor_ctl.simple_command(f'work status {self.unit_id}') - except RuntimeError: - self.task.instance.result_traceback = traceback.format_exc() - self.task.instance.save(update_fields=['result_traceback']) - - result = namedtuple('result', ['status', 'rc']) - return result('error', 1) - if self.task.cancel_callback(): + status = self.task.instance.status + if status == 'running': + status = 'canceled' result = namedtuple('result', ['status', 'rc']) - return result('canceled', 1) + return result(status, 1) time.sleep(1) From 55059b015f45900a10cb6a5102ee554e80222fc4 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 18 Oct 2021 15:32:06 -0400 Subject: [PATCH 22/45] Avoid resultsock shutdown before reading from it --- awx/main/tasks.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index fd032653e2..b05e9ad719 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3177,10 +3177,6 @@ class AWXReceptorJob: self.task.update_model(self.task.instance.pk, status='pending') return - # if we did not exceed the quota, continue with shutting down the job - resultsock.shutdown(socket.SHUT_RDWR) - resultfile.close() - # If ansible-runner ran, but an error occured at runtime, the traceback information # is saved via the status_handler passed in to the processor. 
if state_name == 'Succeeded': @@ -3193,8 +3189,13 @@ class AWXReceptorJob: self.task.instance.result_traceback = b"".join(lines).decode() self.task.instance.save(update_fields=['result_traceback']) except Exception: + resultsock.shutdown(socket.SHUT_RDWR) + resultfile.close() raise RuntimeError(detail) + resultsock.shutdown(socket.SHUT_RDWR) + resultfile.close() + time.sleep(3) return res From 47e67481b3159e4b80bef801637e243141bc1c1e Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 18 Oct 2021 15:35:23 -0400 Subject: [PATCH 23/45] Avoid reaping tentative jobs --- awx/main/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index b05e9ad719..b83c440d09 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -667,13 +667,13 @@ def awx_receptor_workunit_reaper(): logger.exception(f'Unexpected error releasing work unit {job.work_unit_id}') active_unit_ids = [id for (id, data) in receptor_work_list.items() if data.get('StateName') in RECEPTOR_ACTIVE_STATES] + active_unit_ids.append('') # exclude jobs that have not yet started a receptor work unit jobs_without_active_work_unit = UnifiedJob.objects.filter(status__in=ACTIVE_STATES, controller_node=settings.CLUSTER_HOST_ID).exclude( work_unit_id__in=active_unit_ids ) for job in jobs_without_active_work_unit: - # TODO: skip if events came in recently job.cancel_flag = True job.status = 'error' if job.work_unit_id in receptor_work_list: From 2839091b2274317345f2048f48ea299e90c4fb52 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 18 Oct 2021 16:07:36 -0400 Subject: [PATCH 24/45] Avoid extra check if we have job_explanation --- awx/main/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index b83c440d09..7de5902121 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3182,7 +3182,7 @@ class AWXReceptorJob: if state_name == 'Succeeded': return res - if not self.task.instance.result_traceback: + if not (self.task.instance.result_traceback or self.task.instance.job_explanation): try: resultsock = receptor_ctl.get_work_results(self.unit_id, return_sockfile=True) lines = resultsock.readlines() From 231fcc8178e875c42584126492f425d5fdb1cfe5 Mon Sep 17 00:00:00 2001 From: Jim Ladd Date: Wed, 20 Oct 2021 15:13:01 -0700 Subject: [PATCH 25/45] drop lines picked up during merge resolution --- awx/main/tasks.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 7de5902121..dc58c14622 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3106,9 +3106,6 @@ class AWXReceptorJob: receptor_ctl.simple_command(f"work release {self.unit_id}") except RuntimeError: logger.exception(f"Unable to release work item {self.unit_id} from {self.task.instance.log_format}") - # If an error occured without the job itself failing, it could be a broken instance - if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)): - execution_node_health_check.delay(self.task.instance.execution_node) @property def sign_work(self): From 0f77ca605d8c67169638b8e135bcde046abe3675 Mon Sep 17 00:00:00 2001 From: Jim Ladd Date: Wed, 20 Oct 2021 15:50:56 -0700 Subject: [PATCH 26/45] add unit tests --- awx/main/tasks.py | 2 ++ awx/main/tests/unit/test_tasks.py | 48 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index dc58c14622..10a9489d0f 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -655,6 
+655,7 @@ def awx_receptor_workunit_reaper(): receptor_ctl = get_receptor_ctl() receptor_work_list = receptor_ctl.simple_command("work list") + # Release work units for inactive jobs unit_ids = [id for id in receptor_work_list] jobs_with_unreleased_receptor_units = UnifiedJob.objects.filter(work_unit_id__in=unit_ids).exclude(status__in=ACTIVE_STATES) for job in jobs_with_unreleased_receptor_units: @@ -666,6 +667,7 @@ def awx_receptor_workunit_reaper(): if 'unknown work unit' not in str(exc): logger.exception(f'Unexpected error releasing work unit {job.work_unit_id}') + # Cancel jobs missing active work units active_unit_ids = [id for (id, data) in receptor_work_list.items() if data.get('StateName') in RECEPTOR_ACTIVE_STATES] active_unit_ids.append('') # exclude jobs that have not yet started a receptor work unit diff --git a/awx/main/tests/unit/test_tasks.py b/awx/main/tests/unit/test_tasks.py index 15aeb86504..b17e3b3a00 100644 --- a/awx/main/tests/unit/test_tasks.py +++ b/awx/main/tests/unit/test_tasks.py @@ -1990,3 +1990,51 @@ def test_project_update_no_ee(): task.build_env(job, {}) assert 'The project could not sync because there is no Execution Environment' in str(e.value) + + +@mock.patch('awx.main.tasks.administrative_workunit_reaper') +@mock.patch('awx.main.tasks.UnifiedJob.objects.filter') +@mock.patch('awx.main.tasks.get_receptor_ctl') +def test_awx_receptor_workunit_reaper_cancels_work_units_missing_active_jobs( + mock_get_receptor_ctl, mock_uj_objects, mock_administrative_workunit_reaper, mocker +): + mock_receptor_ctl = mocker.MagicMock() + mock_receptor_ctl.simple_command.return_value = { + '4PiSBdGz': {'Detail': 'Running: PID: 134', 'ExtraData': None, 'State': 1, 'StateName': 'Running', 'StdoutSize': 106, 'WorkType': 'ansible-runner'} + } + mock_get_receptor_ctl.return_value = mock_receptor_ctl + + filtered_objects = mocker.MagicMock() + mock_uj_objects.return_value = filtered_objects + + mock_finished_job = mocker.MagicMock(spec=UnifiedJob) + filtered_objects.exclude.side_effect = [[mock_finished_job], []] + + tasks.awx_receptor_workunit_reaper() + assert mock_receptor_ctl.simple_command.called_with('work cancel 4PiSBdGz') + assert mock_receptor_ctl.simple_command.called_with('work release 4PiSBdGz') + + assert mock_administrative_workunit_reaper.called + + +@mock.patch('awx.main.tasks.administrative_workunit_reaper') +@mock.patch('awx.main.tasks.UnifiedJob.objects.filter') +@mock.patch('awx.main.tasks.get_receptor_ctl') +def test_awx_receptor_workunit_reaper_reaps_jobs_missing_work_units(mock_get_receptor_ctl, mock_uj_objects, mock_administrative_workunit_reaper, mocker): + mock_receptor_ctl = mocker.MagicMock() + mock_receptor_ctl.simple_command.return_value = {} + mock_get_receptor_ctl.return_value = mock_receptor_ctl + + filtered_objects = mocker.MagicMock() + mock_uj_objects.return_value = filtered_objects + + mock_job = mocker.MagicMock(spec=UnifiedJob) + filtered_objects.exclude.side_effect = [[], [mock_job]] + + tasks.awx_receptor_workunit_reaper() + assert mock_job.cancel_flag == True + assert mock_job.status == 'error' + assert mock_job.job_explanation == 'Canceled by reaper because receptor work unit could not be found for this job' + assert mock_job.save.called_once_with(update_fields=['cancel_flag', 'status', 'job_explanation']) + + assert mock_administrative_workunit_reaper.called From 8e9fc14b0e5b29f8adfee3dc8938456280a0b32b Mon Sep 17 00:00:00 2001 From: nixocio Date: Wed, 20 Oct 2021 09:59:57 -0400 Subject: [PATCH 27/45] Fix SAML variables default values 
Fix SAML variables default values See: https://github.com/ansible/tower/issues/5372 --- .../AzureAD/AzureADEdit/AzureADEdit.js | 3 +-- .../GitHub/GitHubDetail/GitHubDetail.test.js | 24 +++++++++++-------- .../Setting/GitHub/GitHubEdit/GitHubEdit.js | 3 +-- .../GitHubEnterpriseEdit.js | 3 +-- .../GitHubEnterpriseEdit.test.js | 2 +- .../GitHubEnterpriseOrgEdit.js | 3 +-- .../GitHubEnterpriseOrgEdit.test.js | 2 +- .../GitHubEnterpriseTeamEdit.js | 3 +-- .../GitHubEnterpriseTeamEdit.test.js | 2 +- .../GitHub/GitHubOrgEdit/GitHubOrgEdit.js | 3 +-- .../GitHubOrgEdit/GitHubOrgEdit.test.js | 2 +- .../GitHub/GitHubTeamEdit/GitHubTeamEdit.js | 3 +-- .../GoogleOAuth2Edit/GoogleOAuth2Edit.js | 3 +-- .../screens/Setting/Jobs/JobsEdit/JobsEdit.js | 3 +-- .../screens/Setting/LDAP/LDAPEdit/LDAPEdit.js | 3 +-- .../MiscAuthenticationEdit.js | 3 +-- .../MiscAuthenticationEdit.test.js | 6 ++--- .../screens/Setting/SAML/SAMLEdit/SAMLEdit.js | 3 +-- .../screens/Setting/shared/SettingDetail.js | 6 ++--- .../screens/Setting/shared/SharedFields.js | 13 ++++++---- 20 files changed, 44 insertions(+), 49 deletions(-) diff --git a/awx/ui/src/screens/Setting/AzureAD/AzureADEdit/AzureADEdit.js b/awx/ui/src/screens/Setting/AzureAD/AzureADEdit/AzureADEdit.js index cf728fc27d..bc3eadd1a1 100644 --- a/awx/ui/src/screens/Setting/AzureAD/AzureADEdit/AzureADEdit.js +++ b/awx/ui/src/screens/Setting/AzureAD/AzureADEdit/AzureADEdit.js @@ -94,10 +94,9 @@ function AzureADEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? 
''; } diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubDetail/GitHubDetail.test.js b/awx/ui/src/screens/Setting/GitHub/GitHubDetail/GitHubDetail.test.js index 3991192ba2..256f144459 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubDetail/GitHubDetail.test.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubDetail/GitHubDetail.test.js @@ -147,8 +147,8 @@ describe('', () => { ); assertDetail(wrapper, 'GitHub OAuth2 Key', 'mock github key'); assertDetail(wrapper, 'GitHub OAuth2 Secret', 'Encrypted'); - assertVariableDetail(wrapper, 'GitHub OAuth2 Organization Map', '{}'); - assertVariableDetail(wrapper, 'GitHub OAuth2 Team Map', '{}'); + assertVariableDetail(wrapper, 'GitHub OAuth2 Organization Map', 'null'); + assertVariableDetail(wrapper, 'GitHub OAuth2 Team Map', 'null'); }); test('should hide edit button from non-superusers', async () => { @@ -226,12 +226,12 @@ describe('', () => { assertVariableDetail( wrapper, 'GitHub Organization OAuth2 Organization Map', - '{}' + 'null' ); assertVariableDetail( wrapper, 'GitHub Organization OAuth2 Team Map', - '{}' + 'null' ); }); }); @@ -333,9 +333,13 @@ describe('', () => { assertVariableDetail( wrapper, 'GitHub Enterprise OAuth2 Organization Map', - '{}' + 'null' + ); + assertVariableDetail( + wrapper, + 'GitHub Enterprise OAuth2 Team Map', + 'null' ); - assertVariableDetail(wrapper, 'GitHub Enterprise OAuth2 Team Map', '{}'); }); }); @@ -398,12 +402,12 @@ describe('', () => { assertVariableDetail( wrapper, 'GitHub Enterprise Organization OAuth2 Organization Map', - '{}' + 'null' ); assertVariableDetail( wrapper, 'GitHub Enterprise Organization OAuth2 Team Map', - '{}' + 'null' ); }); }); @@ -463,12 +467,12 @@ describe('', () => { assertVariableDetail( wrapper, 'GitHub Enterprise Team OAuth2 Organization Map', - '{}' + 'null' ); assertVariableDetail( wrapper, 'GitHub Enterprise Team OAuth2 Team Map', - '{}' + 'null' ); }); }); diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubEdit/GitHubEdit.js b/awx/ui/src/screens/Setting/GitHub/GitHubEdit/GitHubEdit.js index 107752bfc6..e936e1f224 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubEdit/GitHubEdit.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubEdit/GitHubEdit.js @@ -92,10 +92,9 @@ function GitHubEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? ''; } diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseEdit/GitHubEnterpriseEdit.js b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseEdit/GitHubEnterpriseEdit.js index 02b8d5c434..4fa7d941b3 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseEdit/GitHubEnterpriseEdit.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseEdit/GitHubEnterpriseEdit.js @@ -94,10 +94,9 @@ function GitHubEnterpriseEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? 
''; } diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseEdit/GitHubEnterpriseEdit.test.js b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseEdit/GitHubEnterpriseEdit.test.js index 7179fbfa24..0f334e8365 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseEdit/GitHubEnterpriseEdit.test.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseEdit/GitHubEnterpriseEdit.test.js @@ -133,7 +133,7 @@ describe('', () => { SOCIAL_AUTH_GITHUB_ENTERPRISE_API_URL: '', SOCIAL_AUTH_GITHUB_ENTERPRISE_KEY: '', SOCIAL_AUTH_GITHUB_ENTERPRISE_SECRET: '', - SOCIAL_AUTH_GITHUB_ENTERPRISE_TEAM_MAP: {}, + SOCIAL_AUTH_GITHUB_ENTERPRISE_TEAM_MAP: null, SOCIAL_AUTH_GITHUB_ENTERPRISE_ORGANIZATION_MAP: { Default: { users: false, diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseOrgEdit/GitHubEnterpriseOrgEdit.js b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseOrgEdit/GitHubEnterpriseOrgEdit.js index d914c46755..0f856558f5 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseOrgEdit/GitHubEnterpriseOrgEdit.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseOrgEdit/GitHubEnterpriseOrgEdit.js @@ -94,10 +94,9 @@ function GitHubEnterpriseOrgEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? ''; } diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseOrgEdit/GitHubEnterpriseOrgEdit.test.js b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseOrgEdit/GitHubEnterpriseOrgEdit.test.js index 84dc3989dd..b6e55487c8 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseOrgEdit/GitHubEnterpriseOrgEdit.test.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseOrgEdit/GitHubEnterpriseOrgEdit.test.js @@ -146,7 +146,7 @@ describe('', () => { SOCIAL_AUTH_GITHUB_ENTERPRISE_ORG_KEY: '', SOCIAL_AUTH_GITHUB_ENTERPRISE_ORG_SECRET: '', SOCIAL_AUTH_GITHUB_ENTERPRISE_ORG_NAME: '', - SOCIAL_AUTH_GITHUB_ENTERPRISE_ORG_TEAM_MAP: {}, + SOCIAL_AUTH_GITHUB_ENTERPRISE_ORG_TEAM_MAP: null, SOCIAL_AUTH_GITHUB_ENTERPRISE_ORG_ORGANIZATION_MAP: { Default: { users: false, diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseTeamEdit/GitHubEnterpriseTeamEdit.js b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseTeamEdit/GitHubEnterpriseTeamEdit.js index 6f638e7e16..8ba8dd176e 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseTeamEdit/GitHubEnterpriseTeamEdit.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseTeamEdit/GitHubEnterpriseTeamEdit.js @@ -94,10 +94,9 @@ function GitHubEnterpriseTeamEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? 
''; } diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseTeamEdit/GitHubEnterpriseTeamEdit.test.js b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseTeamEdit/GitHubEnterpriseTeamEdit.test.js index 6e460e246a..e54c14c1cd 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseTeamEdit/GitHubEnterpriseTeamEdit.test.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubEnterpriseTeamEdit/GitHubEnterpriseTeamEdit.test.js @@ -140,7 +140,7 @@ describe('', () => { SOCIAL_AUTH_GITHUB_ENTERPRISE_TEAM_KEY: '', SOCIAL_AUTH_GITHUB_ENTERPRISE_TEAM_SECRET: '', SOCIAL_AUTH_GITHUB_ENTERPRISE_TEAM_ID: '', - SOCIAL_AUTH_GITHUB_ENTERPRISE_TEAM_TEAM_MAP: {}, + SOCIAL_AUTH_GITHUB_ENTERPRISE_TEAM_TEAM_MAP: null, SOCIAL_AUTH_GITHUB_ENTERPRISE_TEAM_ORGANIZATION_MAP: { Default: { users: false, diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubOrgEdit/GitHubOrgEdit.js b/awx/ui/src/screens/Setting/GitHub/GitHubOrgEdit/GitHubOrgEdit.js index c2e8f83cd6..3b1beab537 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubOrgEdit/GitHubOrgEdit.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubOrgEdit/GitHubOrgEdit.js @@ -94,10 +94,9 @@ function GitHubOrgEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? ''; } diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubOrgEdit/GitHubOrgEdit.test.js b/awx/ui/src/screens/Setting/GitHub/GitHubOrgEdit/GitHubOrgEdit.test.js index 777eb698af..f8f99b3d25 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubOrgEdit/GitHubOrgEdit.test.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubOrgEdit/GitHubOrgEdit.test.js @@ -122,7 +122,7 @@ describe('', () => { SOCIAL_AUTH_GITHUB_ORG_KEY: '', SOCIAL_AUTH_GITHUB_ORG_SECRET: '', SOCIAL_AUTH_GITHUB_ORG_NAME: 'new org', - SOCIAL_AUTH_GITHUB_ORG_TEAM_MAP: {}, + SOCIAL_AUTH_GITHUB_ORG_TEAM_MAP: null, SOCIAL_AUTH_GITHUB_ORG_ORGANIZATION_MAP: { Default: { users: false, diff --git a/awx/ui/src/screens/Setting/GitHub/GitHubTeamEdit/GitHubTeamEdit.js b/awx/ui/src/screens/Setting/GitHub/GitHubTeamEdit/GitHubTeamEdit.js index b4506c667b..d12d6baae7 100644 --- a/awx/ui/src/screens/Setting/GitHub/GitHubTeamEdit/GitHubTeamEdit.js +++ b/awx/ui/src/screens/Setting/GitHub/GitHubTeamEdit/GitHubTeamEdit.js @@ -94,10 +94,9 @@ function GitHubTeamEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? ''; } diff --git a/awx/ui/src/screens/Setting/GoogleOAuth2/GoogleOAuth2Edit/GoogleOAuth2Edit.js b/awx/ui/src/screens/Setting/GoogleOAuth2/GoogleOAuth2Edit/GoogleOAuth2Edit.js index 8da9e8f583..3fa679beeb 100644 --- a/awx/ui/src/screens/Setting/GoogleOAuth2/GoogleOAuth2Edit/GoogleOAuth2Edit.js +++ b/awx/ui/src/screens/Setting/GoogleOAuth2/GoogleOAuth2Edit/GoogleOAuth2Edit.js @@ -100,10 +100,9 @@ function GoogleOAuth2Edit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? 
'[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? ''; } diff --git a/awx/ui/src/screens/Setting/Jobs/JobsEdit/JobsEdit.js b/awx/ui/src/screens/Setting/Jobs/JobsEdit/JobsEdit.js index 1e6d8cef73..fec8d6cdb8 100644 --- a/awx/ui/src/screens/Setting/Jobs/JobsEdit/JobsEdit.js +++ b/awx/ui/src/screens/Setting/Jobs/JobsEdit/JobsEdit.js @@ -103,10 +103,9 @@ function JobsEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? ''; } diff --git a/awx/ui/src/screens/Setting/LDAP/LDAPEdit/LDAPEdit.js b/awx/ui/src/screens/Setting/LDAP/LDAPEdit/LDAPEdit.js index 22929f65b9..22e00db54e 100644 --- a/awx/ui/src/screens/Setting/LDAP/LDAPEdit/LDAPEdit.js +++ b/awx/ui/src/screens/Setting/LDAP/LDAPEdit/LDAPEdit.js @@ -146,10 +146,9 @@ function LDAPEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? ''; } diff --git a/awx/ui/src/screens/Setting/MiscAuthentication/MiscAuthenticationEdit/MiscAuthenticationEdit.js b/awx/ui/src/screens/Setting/MiscAuthentication/MiscAuthenticationEdit/MiscAuthenticationEdit.js index 6fb9f6c919..f19807e842 100644 --- a/awx/ui/src/screens/Setting/MiscAuthentication/MiscAuthenticationEdit/MiscAuthenticationEdit.js +++ b/awx/ui/src/screens/Setting/MiscAuthentication/MiscAuthenticationEdit/MiscAuthenticationEdit.js @@ -164,10 +164,9 @@ function MiscAuthenticationEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? 
''; } diff --git a/awx/ui/src/screens/Setting/MiscAuthentication/MiscAuthenticationEdit/MiscAuthenticationEdit.test.js b/awx/ui/src/screens/Setting/MiscAuthentication/MiscAuthenticationEdit/MiscAuthenticationEdit.test.js index b3cbd31db2..cf00ea5716 100644 --- a/awx/ui/src/screens/Setting/MiscAuthentication/MiscAuthenticationEdit/MiscAuthenticationEdit.test.js +++ b/awx/ui/src/screens/Setting/MiscAuthentication/MiscAuthenticationEdit/MiscAuthenticationEdit.test.js @@ -29,9 +29,9 @@ const authenticationData = { 'awx.sso.backends.TACACSPlusBackend', 'awx.main.backends.AWXModelBackend', ], - SOCIAL_AUTH_ORGANIZATION_MAP: {}, - SOCIAL_AUTH_TEAM_MAP: {}, - SOCIAL_AUTH_USER_FIELDS: [], + SOCIAL_AUTH_ORGANIZATION_MAP: null, + SOCIAL_AUTH_TEAM_MAP: null, + SOCIAL_AUTH_USER_FIELDS: null, }; describe('', () => { diff --git a/awx/ui/src/screens/Setting/SAML/SAMLEdit/SAMLEdit.js b/awx/ui/src/screens/Setting/SAML/SAMLEdit/SAMLEdit.js index 46db9eda90..17202503a2 100644 --- a/awx/ui/src/screens/Setting/SAML/SAMLEdit/SAMLEdit.js +++ b/awx/ui/src/screens/Setting/SAML/SAMLEdit/SAMLEdit.js @@ -112,10 +112,9 @@ function SAMLEdit() { const initialValues = (fields) => Object.keys(fields).reduce((acc, key) => { if (fields[key].type === 'list' || fields[key].type === 'nested object') { - const emptyDefault = fields[key].type === 'list' ? '[]' : '{}'; acc[key] = fields[key].value ? JSON.stringify(fields[key].value, null, 2) - : emptyDefault; + : null; } else { acc[key] = fields[key].value ?? ''; } diff --git a/awx/ui/src/screens/Setting/shared/SettingDetail.js b/awx/ui/src/screens/Setting/shared/SettingDetail.js index dca91d0cef..c133bfbe06 100644 --- a/awx/ui/src/screens/Setting/shared/SettingDetail.js +++ b/awx/ui/src/screens/Setting/shared/SettingDetail.js @@ -5,7 +5,7 @@ import { Detail } from 'components/DetailList'; import CodeDetail from 'components/DetailList/CodeDetail'; function sortObj(obj) { - if (typeof obj !== 'object' || Array.isArray(obj)) { + if (typeof obj !== 'object' || Array.isArray(obj) || obj === null) { return obj; } const sorted = {}; @@ -30,7 +30,7 @@ export default ({ helpText, id, label, type, unit = '', value }) => { label={label} mode="javascript" rows={4} - value={JSON.stringify(sortObj(value || {}), undefined, 2)} + value={JSON.stringify(sortObj(value), undefined, 2)} /> ); break; @@ -42,7 +42,7 @@ export default ({ helpText, id, label, type, unit = '', value }) => { label={label} mode="javascript" rows={4} - value={JSON.stringify(value || [], undefined, 2)} + value={JSON.stringify(value, undefined, 2)} /> ); break; diff --git a/awx/ui/src/screens/Setting/shared/SharedFields.js b/awx/ui/src/screens/Setting/shared/SharedFields.js index 887547186b..d085761a93 100644 --- a/awx/ui/src/screens/Setting/shared/SharedFields.js +++ b/awx/ui/src/screens/Setting/shared/SharedFields.js @@ -440,10 +440,8 @@ const ObjectField = ({ name, config, isRequired = false }) => { const [field, meta, helpers] = useField({ name, validate }); const isValid = !(meta.touched && meta.error); - const emptyDefault = config?.type === 'list' ? '[]' : '{}'; - const defaultRevertValue = config?.default - ? JSON.stringify(config.default, null, 2) - : emptyDefault; + const defaultRevertValue = + config?.default !== null ? JSON.stringify(config.default, null, 2) : null; return config ? 
( @@ -458,7 +456,12 @@ const ObjectField = ({ name, config, isRequired = false }) => { > { From ecb84e090cb595c6d776d3f24f68cb69d54f8648 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 25 Oct 2021 10:16:56 -0400 Subject: [PATCH 28/45] Revert "Merge pull request #5354 from ansible/jobs_killed_via_receptor_should_get_reaped" This reverts commit 8736858d808402d4520112153414bd875cf5cfe7, reversing changes made to 84e77c9db9f8ec2a781be8da834256b350e865f2. --- awx/main/tasks.py | 61 ++++--------------------------- awx/main/tests/unit/test_tasks.py | 48 ------------------------ 2 files changed, 8 insertions(+), 101 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 10a9489d0f..90b5672241 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -108,15 +108,7 @@ from awx.main.utils.safe_yaml import safe_dump, sanitize_jinja from awx.main.utils.reload import stop_local_services from awx.main.utils.pglock import advisory_lock from awx.main.utils.handlers import SpecialInventoryHandler -from awx.main.utils.receptor import ( - get_receptor_ctl, - worker_info, - get_conn_type, - get_tls_client, - worker_cleanup, - administrative_workunit_reaper, - RECEPTOR_ACTIVE_STATES, -) +from awx.main.utils.receptor import get_receptor_ctl, worker_info, get_conn_type, get_tls_client, worker_cleanup, administrative_workunit_reaper from awx.main.consumers import emit_channel_notification from awx.main import analytics from awx.conf import settings_registry @@ -655,37 +647,12 @@ def awx_receptor_workunit_reaper(): receptor_ctl = get_receptor_ctl() receptor_work_list = receptor_ctl.simple_command("work list") - # Release work units for inactive jobs unit_ids = [id for id in receptor_work_list] jobs_with_unreleased_receptor_units = UnifiedJob.objects.filter(work_unit_id__in=unit_ids).exclude(status__in=ACTIVE_STATES) for job in jobs_with_unreleased_receptor_units: logger.debug(f"{job.log_format} is not active, reaping receptor work unit {job.work_unit_id}") - try: - receptor_ctl.simple_command(f"work cancel {job.work_unit_id}") - receptor_ctl.simple_command(f"work release {job.work_unit_id}") - except RuntimeError as exc: - if 'unknown work unit' not in str(exc): - logger.exception(f'Unexpected error releasing work unit {job.work_unit_id}') - - # Cancel jobs missing active work units - active_unit_ids = [id for (id, data) in receptor_work_list.items() if data.get('StateName') in RECEPTOR_ACTIVE_STATES] - active_unit_ids.append('') # exclude jobs that have not yet started a receptor work unit - - jobs_without_active_work_unit = UnifiedJob.objects.filter(status__in=ACTIVE_STATES, controller_node=settings.CLUSTER_HOST_ID).exclude( - work_unit_id__in=active_unit_ids - ) - - for job in jobs_without_active_work_unit: - job.cancel_flag = True - job.status = 'error' - if job.work_unit_id in receptor_work_list: - receptor_status = receptor_work_list[job.work_unit_id]['StateName'] - logger.warn(f'Canceling {job.log_format} because of inactive work unit, data:\n{receptor_work_list[job.work_unit_id]}') - job.job_explanation = f'Canceled by reaper because receptor work unit reported a status of {receptor_status}' - else: - logger.warn(f'Canceling {job.log_format} because of missing work unit {job.work_unit_id}') - job.job_explanation = f'Canceled by reaper because receptor work unit could not be found for this job' - job.save(update_fields=['cancel_flag', 'status', 'job_explanation']) + receptor_ctl.simple_command(f"work cancel {job.work_unit_id}") + receptor_ctl.simple_command(f"work release 
{job.work_unit_id}") administrative_workunit_reaper(receptor_work_list) @@ -3104,10 +3071,7 @@ class AWXReceptorJob: finally: # Make sure to always release the work unit if we established it if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: - try: - receptor_ctl.simple_command(f"work release {self.unit_id}") - except RuntimeError: - logger.exception(f"Unable to release work item {self.unit_id} from {self.task.instance.log_format}") + receptor_ctl.simple_command(f"work release {self.unit_id}") @property def sign_work(self): @@ -3164,10 +3128,10 @@ class AWXReceptorJob: unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}') detail = unit_status.get('Detail', None) state_name = unit_status.get('StateName', None) - except RuntimeError: + except RuntimeError as e: detail = '' state_name = '' - logger.exception(f'Unable to retrieve work status for {self.unit_id}') + logger.exception(e) if 'exceeded quota' in detail: logger.warn(detail) @@ -3175,26 +3139,20 @@ class AWXReceptorJob: logger.warn(f"Could not launch pod for {log_name}. Exceeded quota.") self.task.update_model(self.task.instance.pk, status='pending') return - # If ansible-runner ran, but an error occured at runtime, the traceback information # is saved via the status_handler passed in to the processor. if state_name == 'Succeeded': return res - if not (self.task.instance.result_traceback or self.task.instance.job_explanation): + if not self.task.instance.result_traceback: try: resultsock = receptor_ctl.get_work_results(self.unit_id, return_sockfile=True) lines = resultsock.readlines() self.task.instance.result_traceback = b"".join(lines).decode() self.task.instance.save(update_fields=['result_traceback']) except Exception: - resultsock.shutdown(socket.SHUT_RDWR) - resultfile.close() raise RuntimeError(detail) - resultsock.shutdown(socket.SHUT_RDWR) - resultfile.close() - time.sleep(3) return res @@ -3265,11 +3223,8 @@ class AWXReceptorJob: return processor_future.result() if self.task.cancel_callback(): - status = self.task.instance.status - if status == 'running': - status = 'canceled' result = namedtuple('result', ['status', 'rc']) - return result(status, 1) + return result('canceled', 1) time.sleep(1) diff --git a/awx/main/tests/unit/test_tasks.py b/awx/main/tests/unit/test_tasks.py index b17e3b3a00..15aeb86504 100644 --- a/awx/main/tests/unit/test_tasks.py +++ b/awx/main/tests/unit/test_tasks.py @@ -1990,51 +1990,3 @@ def test_project_update_no_ee(): task.build_env(job, {}) assert 'The project could not sync because there is no Execution Environment' in str(e.value) - - -@mock.patch('awx.main.tasks.administrative_workunit_reaper') -@mock.patch('awx.main.tasks.UnifiedJob.objects.filter') -@mock.patch('awx.main.tasks.get_receptor_ctl') -def test_awx_receptor_workunit_reaper_cancels_work_units_missing_active_jobs( - mock_get_receptor_ctl, mock_uj_objects, mock_administrative_workunit_reaper, mocker -): - mock_receptor_ctl = mocker.MagicMock() - mock_receptor_ctl.simple_command.return_value = { - '4PiSBdGz': {'Detail': 'Running: PID: 134', 'ExtraData': None, 'State': 1, 'StateName': 'Running', 'StdoutSize': 106, 'WorkType': 'ansible-runner'} - } - mock_get_receptor_ctl.return_value = mock_receptor_ctl - - filtered_objects = mocker.MagicMock() - mock_uj_objects.return_value = filtered_objects - - mock_finished_job = mocker.MagicMock(spec=UnifiedJob) - filtered_objects.exclude.side_effect = [[mock_finished_job], []] - - tasks.awx_receptor_workunit_reaper() - assert 
mock_receptor_ctl.simple_command.called_with('work cancel 4PiSBdGz') - assert mock_receptor_ctl.simple_command.called_with('work release 4PiSBdGz') - - assert mock_administrative_workunit_reaper.called - - -@mock.patch('awx.main.tasks.administrative_workunit_reaper') -@mock.patch('awx.main.tasks.UnifiedJob.objects.filter') -@mock.patch('awx.main.tasks.get_receptor_ctl') -def test_awx_receptor_workunit_reaper_reaps_jobs_missing_work_units(mock_get_receptor_ctl, mock_uj_objects, mock_administrative_workunit_reaper, mocker): - mock_receptor_ctl = mocker.MagicMock() - mock_receptor_ctl.simple_command.return_value = {} - mock_get_receptor_ctl.return_value = mock_receptor_ctl - - filtered_objects = mocker.MagicMock() - mock_uj_objects.return_value = filtered_objects - - mock_job = mocker.MagicMock(spec=UnifiedJob) - filtered_objects.exclude.side_effect = [[], [mock_job]] - - tasks.awx_receptor_workunit_reaper() - assert mock_job.cancel_flag == True - assert mock_job.status == 'error' - assert mock_job.job_explanation == 'Canceled by reaper because receptor work unit could not be found for this job' - assert mock_job.save.called_once_with(update_fields=['cancel_flag', 'status', 'job_explanation']) - - assert mock_administrative_workunit_reaper.called From 329caad6818fe3c908ceede21234b356a08638e7 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Thu, 21 Oct 2021 15:38:03 -0400 Subject: [PATCH 29/45] In admin reaper skip work units w/o params --- awx/main/utils/receptor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py index 19d76afd90..e1961ca905 100644 --- a/awx/main/utils/receptor.py +++ b/awx/main/utils/receptor.py @@ -86,6 +86,8 @@ def administrative_workunit_reaper(work_list=None): if (extra_data is None) or (extra_data.get('RemoteWorkType') != 'ansible-runner'): continue # if this is not ansible-runner work, we do not want to touch it params = extra_data.get('RemoteParams', {}).get('params') + if not params: + continue if not (params == '--worker-info' or params.startswith('cleanup')): continue # if this is not a cleanup or health check, we do not want to touch it if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES: From a9636426b8408edda6c946cc5fdb6d91da8f5076 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Mon, 25 Oct 2021 16:48:04 -0400 Subject: [PATCH 30/45] Make the inventory_import command respect the default EE and credential --- .../management/commands/inventory_import.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/awx/main/management/commands/inventory_import.py b/awx/main/management/commands/inventory_import.py index 8f524c1f05..43c6671bda 100644 --- a/awx/main/management/commands/inventory_import.py +++ b/awx/main/management/commands/inventory_import.py @@ -2,12 +2,15 @@ # All Rights Reserved. 
# Python +from base64 import b64encode import json import logging import os import re +import stat import subprocess import sys +import tempfile import time import traceback from collections import OrderedDict @@ -76,7 +79,24 @@ class AnsibleInventoryLoader(object): bargs.extend(['-v', '{0}:{0}:Z'.format(self.source)]) for key, value in STANDARD_INVENTORY_UPDATE_ENV.items(): bargs.extend(['-e', '{0}={1}'.format(key, value)]) - bargs.extend([get_default_execution_environment().image]) + ee = get_default_execution_environment() + bargs.extend([ee.image]) + + if ee.cred: + if not ee.cred.has_inputs(field_names=('host', 'username', 'password')): + raise RuntimeError(f"Registry credential for execution environment `{ee}` is missing either the host, username, or password.") + + host = ee.cred.get_input('host') + username = ee.cred.get_input('username') + password = ee.cred.get_input('password') + token = f"{username}:{password}" + auth_data = {'auths': {host: {'auth': b64encode(token.encode('utf-8')).decode('utf-8')}}} + self.authfile.write(json.dumps(auth_data, indent=4).encode('utf-8')) + bargs.append(f'--authfile={self.authfile.name}') + + if ee.pull: + bargs.append(f'--pull={ee.pull}') + bargs.extend(['ansible-inventory', '-i', self.source]) bargs.extend(['--playbook-dir', functioning_dir(self.source)]) if self.verbosity: @@ -110,11 +130,12 @@ class AnsibleInventoryLoader(object): return data def load(self): - base_args = self.get_base_args() - - logger.info('Reading Ansible inventory source: %s', self.source) - - return self.command_to_json(base_args) + with tempfile.NamedTemporaryFile() as f: + self.authfile = f + os.chmod(self.authfile.name, stat.S_IRUSR | stat.S_IWUSR) + base_args = self.get_base_args() + logger.info('Reading Ansible inventory source: %s', self.source) + return self.command_to_json(base_args) class Command(BaseCommand): From d79da1ef9fc19dd85cf5a3795f46c2cc1da056d6 Mon Sep 17 00:00:00 2001 From: Bianca Henderson Date: Tue, 26 Oct 2021 16:53:30 -0400 Subject: [PATCH 31/45] Catch exceptions that might pop up when releasing work units --- awx/main/tasks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 90b5672241..9ea2447126 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3071,7 +3071,10 @@ class AWXReceptorJob: finally: # Make sure to always release the work unit if we established it if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: - receptor_ctl.simple_command(f"work release {self.unit_id}") + try: + receptor_ctl.simple_command(f"work release {self.unit_id}") + except Exception as e: + logger.exception(e) @property def sign_work(self): @@ -3128,7 +3131,7 @@ class AWXReceptorJob: unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}') detail = unit_status.get('Detail', None) state_name = unit_status.get('StateName', None) - except RuntimeError as e: + except Exception as e: detail = '' state_name = '' logger.exception(e) From 8275082896ac8a51ae14f3eac4091c50b7a6e798 Mon Sep 17 00:00:00 2001 From: Bianca Henderson Date: Tue, 26 Oct 2021 17:38:35 -0400 Subject: [PATCH 32/45] Update error messages for when exceptions are caught --- awx/main/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 9ea2447126..948ceb6c79 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3073,8 +3073,8 @@ class AWXReceptorJob: if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: try: 
receptor_ctl.simple_command(f"work release {self.unit_id}") - except Exception as e: - logger.exception(e) + except Exception: + logger.exception(f"Error releasing work unit {self.unit_id}.") @property def sign_work(self): @@ -3131,10 +3131,10 @@ class AWXReceptorJob: unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}') detail = unit_status.get('Detail', None) state_name = unit_status.get('StateName', None) - except Exception as e: + except Exception: detail = '' state_name = '' - logger.exception(e) + logger.exception(f'An error was encountered while canceling work unit {self.unit_id}') if 'exceeded quota' in detail: logger.warn(detail) From 58cdbca5cf413e376b7b064800b72ebb5c722e9a Mon Sep 17 00:00:00 2001 From: Bianca Henderson Date: Wed, 27 Oct 2021 08:59:55 -0400 Subject: [PATCH 33/45] Update error message to be more accurate --- awx/main/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 948ceb6c79..a782a8e04e 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3134,7 +3134,7 @@ class AWXReceptorJob: except Exception: detail = '' state_name = '' - logger.exception(f'An error was encountered while canceling work unit {self.unit_id}') + logger.exception(f'An error was encountered while getting status for work unit {self.unit_id}') if 'exceeded quota' in detail: logger.warn(detail) From 010c3ab0b8a91b3ea4a9973f1a3dbc6e0d4e860d Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Wed, 27 Oct 2021 10:36:20 -0400 Subject: [PATCH 34/45] Fix a typo in inventory_import ExecutionEnvironment.credential got shortened to .cred. --- awx/main/management/commands/inventory_import.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/awx/main/management/commands/inventory_import.py b/awx/main/management/commands/inventory_import.py index 43c6671bda..ddd7f9cd83 100644 --- a/awx/main/management/commands/inventory_import.py +++ b/awx/main/management/commands/inventory_import.py @@ -82,13 +82,13 @@ class AnsibleInventoryLoader(object): ee = get_default_execution_environment() bargs.extend([ee.image]) - if ee.cred: - if not ee.cred.has_inputs(field_names=('host', 'username', 'password')): + if ee.credential: + if not ee.credential.has_inputs(field_names=('host', 'username', 'password')): raise RuntimeError(f"Registry credential for execution environment `{ee}` is missing either the host, username, or password.") - host = ee.cred.get_input('host') - username = ee.cred.get_input('username') - password = ee.cred.get_input('password') + host = ee.credential.get_input('host') + username = ee.credential.get_input('username') + password = ee.credential.get_input('password') token = f"{username}:{password}" auth_data = {'auths': {host: {'auth': b64encode(token.encode('utf-8')).decode('utf-8')}}} self.authfile.write(json.dumps(auth_data, indent=4).encode('utf-8')) From d3c695b853f56d624b47f19a3174ca856f3f0467 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Wed, 27 Oct 2021 11:28:49 -0400 Subject: [PATCH 35/45] Clean up some scar tissue left behind by the initial use of the black code formatter. 
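Some context on the pattern being removed: Python implicitly concatenates adjacent string literals at compile time, which is how these help strings ended up as several fragments sitting side by side on one line after the reformat. A minimal sketch of the behavior, reusing one of the help strings from the diff below; both forms evaluate to exactly the same string:

    # Adjacent string literals are implicitly joined, so both names below
    # hold exactly the same help text.
    fragmented = 'regular expression ' 'to filter group name(s); only matches are imported.'
    joined = 'regular expression to filter group name(s); only matches are imported.'
    assert fragmented == joined

The cleanup simply folds each fragmented help string into a single literal, so the user-visible text is unchanged.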
--- awx/main/management/commands/inventory_import.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/awx/main/management/commands/inventory_import.py b/awx/main/management/commands/inventory_import.py index ddd7f9cd83..cd99cdef42 100644 --- a/awx/main/management/commands/inventory_import.py +++ b/awx/main/management/commands/inventory_import.py @@ -159,7 +159,7 @@ class Command(BaseCommand): type=str, default=None, metavar='v', - help='host variable used to ' 'set/clear enabled flag when host is online/offline, may ' 'be specified as "foo.bar" to traverse nested dicts.', + help='host variable used to set/clear enabled flag when host is online/offline, may be specified as "foo.bar" to traverse nested dicts.', ) parser.add_argument( '--enabled-value', @@ -167,7 +167,7 @@ class Command(BaseCommand): type=str, default=None, metavar='v', - help='value of host variable ' 'specified by --enabled-var that indicates host is ' 'enabled/online.', + help='value of host variable specified by --enabled-var that indicates host is enabled/online.', ) parser.add_argument( '--group-filter', @@ -175,7 +175,7 @@ class Command(BaseCommand): type=str, default=None, metavar='regex', - help='regular expression ' 'to filter group name(s); only matches are imported.', + help='regular expression to filter group name(s); only matches are imported.', ) parser.add_argument( '--host-filter', @@ -183,14 +183,14 @@ class Command(BaseCommand): type=str, default=None, metavar='regex', - help='regular expression ' 'to filter host name(s); only matches are imported.', + help='regular expression to filter host name(s); only matches are imported.', ) parser.add_argument( '--exclude-empty-groups', dest='exclude_empty_groups', action='store_true', default=False, - help='when set, ' 'exclude all groups that have no child groups, hosts, or ' 'variables.', + help='when set, exclude all groups that have no child groups, hosts, or variables.', ) parser.add_argument( '--instance-id-var', @@ -198,7 +198,7 @@ class Command(BaseCommand): type=str, default=None, metavar='v', - help='host variable that ' 'specifies the unique, immutable instance ID, may be ' 'specified as "foo.bar" to traverse nested dicts.', + help='host variable that specifies the unique, immutable instance ID, may be specified as "foo.bar" to traverse nested dicts.', ) def set_logging_level(self, verbosity): @@ -1038,4 +1038,4 @@ class Command(BaseCommand): if settings.SQL_DEBUG: queries_this_import = connection.queries[queries_before:] sqltime = sum(float(x['time']) for x in queries_this_import) - logger.warning('Inventory import required %d queries ' 'taking %0.3fs', len(queries_this_import), sqltime) + logger.warning('Inventory import required %d queries taking %0.3fs', len(queries_this_import), sqltime) From db7fb818558f1d2e4b267081b5ba5f4f571dd0a6 Mon Sep 17 00:00:00 2001 From: Kersom <9053044+nixocio@users.noreply.github.com> Date: Wed, 27 Oct 2021 16:37:54 -0400 Subject: [PATCH 36/45] Fix login redirect (#5386) Allows the user to visit login page when the login redirect url is set. Also, redirects to login page once logging out and there is session from a SAML available. 
See: https://github.com/ansible/awx/issues/11012 --- awx/ui/src/App.js | 19 +++++++++++++--- awx/ui/src/App.test.js | 36 ++++++++++++++++++++++++++++++- awx/ui/src/contexts/Session.js | 35 +++++++++++++++++++++++++++--- awx/ui/src/screens/Login/Login.js | 15 ++----------- 4 files changed, 85 insertions(+), 20 deletions(-) diff --git a/awx/ui/src/App.js b/awx/ui/src/App.js index ab0211bd38..674dec8b07 100644 --- a/awx/ui/src/App.js +++ b/awx/ui/src/App.js @@ -98,8 +98,13 @@ const AuthorizedRoutes = ({ routeConfig }) => { ); }; -const ProtectedRoute = ({ children, ...rest }) => { - const { authRedirectTo, setAuthRedirectTo } = useSession(); +export function ProtectedRoute({ children, ...rest }) { + const { + authRedirectTo, + setAuthRedirectTo, + loginRedirectOverride, + isUserBeingLoggedOut, + } = useSession(); const location = useLocation(); useEffect(() => { @@ -120,8 +125,16 @@ const ProtectedRoute = ({ children, ...rest }) => { ); } + if ( + loginRedirectOverride && + !window.location.href.includes('/login') && + !isUserBeingLoggedOut + ) { + window.location.replace(loginRedirectOverride); + return null; + } return ; -}; +} function App() { const history = useHistory(); diff --git a/awx/ui/src/App.test.js b/awx/ui/src/App.test.js index a8b842714a..edcf60ebb7 100644 --- a/awx/ui/src/App.test.js +++ b/awx/ui/src/App.test.js @@ -3,7 +3,7 @@ import { act } from 'react-dom/test-utils'; import { RootAPI } from 'api'; import * as SessionContext from 'contexts/Session'; import { mountWithContexts } from '../testUtils/enzymeHelpers'; -import App from './App'; +import App, { ProtectedRoute } from './App'; jest.mock('./api'); @@ -20,6 +20,8 @@ describe('', () => { const contextValues = { setAuthRedirectTo: jest.fn(), isSessionExpired: false, + isUserBeingLoggedOut: false, + loginRedirectOverride: null, }; jest .spyOn(SessionContext, 'useSession') @@ -32,4 +34,36 @@ describe('', () => { expect(wrapper.length).toBe(1); jest.clearAllMocks(); }); + + test('redirect to login override', async () => { + const { location } = window; + delete window.location; + window.location = { + replace: jest.fn(), + href: '/', + }; + + expect(window.location.replace).not.toHaveBeenCalled(); + + const contextValues = { + setAuthRedirectTo: jest.fn(), + isSessionExpired: false, + isUserBeingLoggedOut: false, + loginRedirectOverride: '/sso/test', + }; + jest + .spyOn(SessionContext, 'useSession') + .mockImplementation(() => contextValues); + + await act(async () => { + mountWithContexts( + +
+ <ProtectedRoute>
+ foo
+ </ProtectedRoute>
+ ); + }); + + expect(window.location.replace).toHaveBeenCalled(); + window.location = location; + }); }); diff --git a/awx/ui/src/contexts/Session.js b/awx/ui/src/contexts/Session.js index f6db1e3be3..1f76826abf 100644 --- a/awx/ui/src/contexts/Session.js +++ b/awx/ui/src/contexts/Session.js @@ -5,10 +5,11 @@ import React, { useRef, useCallback, } from 'react'; -import { useHistory } from 'react-router-dom'; +import { useHistory, Redirect } from 'react-router-dom'; import { DateTime } from 'luxon'; import { RootAPI, MeAPI } from 'api'; import { isAuthenticated } from 'util/auth'; +import useRequest from 'hooks/useRequest'; import { SESSION_TIMEOUT_KEY } from '../constants'; // The maximum supported timeout for setTimeout(), in milliseconds, @@ -72,8 +73,31 @@ function SessionProvider({ children }) { const [sessionTimeout, setSessionTimeout] = useStorage(SESSION_TIMEOUT_KEY); const [sessionCountdown, setSessionCountdown] = useState(0); const [authRedirectTo, setAuthRedirectTo] = useState('/'); + const [isUserBeingLoggedOut, setIsUserBeingLoggedOut] = useState(false); + + const { + request: fetchLoginRedirectOverride, + result: { loginRedirectOverride }, + isLoading, + } = useRequest( + useCallback(async () => { + const { data } = await RootAPI.read(); + return { + loginRedirectOverride: data?.login_redirect_override, + }; + }, []), + { + loginRedirectOverride: null, + isLoading: true, + } + ); + + useEffect(() => { + fetchLoginRedirectOverride(); + }, [fetchLoginRedirectOverride]); const logout = useCallback(async () => { + setIsUserBeingLoggedOut(true); if (!isSessionExpired.current) { setAuthRedirectTo('/logout'); } @@ -82,14 +106,13 @@ function SessionProvider({ children }) { setSessionCountdown(0); clearTimeout(sessionTimeoutId.current); clearInterval(sessionIntervalId.current); + return ; }, [setSessionTimeout, setSessionCountdown]); useEffect(() => { if (!isAuthenticated(document.cookie)) { - history.replace('/login'); return () => {}; } - const calcRemaining = () => { if (sessionTimeout) { return Math.max( @@ -140,9 +163,15 @@ function SessionProvider({ children }) { clearInterval(sessionIntervalId.current); }, []); + if (isLoading) { + return null; + } + return ( { const [ { - data: { custom_logo, custom_login_info, login_redirect_override }, + data: { custom_logo, custom_login_info }, }, { data: { BRAND_NAME }, @@ -78,7 +72,6 @@ function AWXLogin({ alt, isAuthenticated }) { logo: logoSrc, loginInfo: custom_login_info, socialAuthOptions: authData, - loginRedirectOverride: login_redirect_override, }; }, []), { @@ -118,10 +111,6 @@ function AWXLogin({ alt, isAuthenticated }) { if (isCustomLoginInfoLoading) { return null; } - if (!isAuthenticated(document.cookie) && loginRedirectOverride) { - window.location.replace(loginRedirectOverride); - return null; - } if (isAuthenticated(document.cookie)) { return ; } From d42fe921db858023dc3e4f99add0eb2f31f88539 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Thu, 28 Oct 2021 15:25:45 -0400 Subject: [PATCH 37/45] Re-order authfile option to make inventory import command work --- awx/main/management/commands/inventory_import.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/awx/main/management/commands/inventory_import.py b/awx/main/management/commands/inventory_import.py index cd99cdef42..94bcabb2a1 100644 --- a/awx/main/management/commands/inventory_import.py +++ b/awx/main/management/commands/inventory_import.py @@ -80,7 +80,6 @@ class AnsibleInventoryLoader(object): for key, value in 
STANDARD_INVENTORY_UPDATE_ENV.items(): bargs.extend(['-e', '{0}={1}'.format(key, value)]) ee = get_default_execution_environment() - bargs.extend([ee.image]) if ee.credential: if not ee.credential.has_inputs(field_names=('host', 'username', 'password')): @@ -97,6 +96,8 @@ if ee.pull: bargs.append(f'--pull={ee.pull}') + bargs.extend([ee.image]) + bargs.extend(['ansible-inventory', '-i', self.source]) bargs.extend(['--playbook-dir', functioning_dir(self.source)]) if self.verbosity: From 0d1f8a06ced47f226fdc5a3b0cb0de4f8fcdd5b2 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Fri, 29 Oct 2021 09:27:42 -0400 Subject: [PATCH 38/45] Revert default EE authfile support for inventory_import --- .../management/commands/inventory_import.py | 27 +++---------------- 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/awx/main/management/commands/inventory_import.py b/awx/main/management/commands/inventory_import.py index 94bcabb2a1..a3c6acdab3 100644 --- a/awx/main/management/commands/inventory_import.py +++ b/awx/main/management/commands/inventory_import.py @@ -2,15 +2,12 @@ # All Rights Reserved. # Python -from base64 import b64encode import json import logging import os import re -import stat import subprocess import sys -import tempfile import time import traceback from collections import OrderedDict @@ -81,21 +78,6 @@ class AnsibleInventoryLoader(object): bargs.extend(['-e', '{0}={1}'.format(key, value)]) ee = get_default_execution_environment() - if ee.credential: - if not ee.credential.has_inputs(field_names=('host', 'username', 'password')): - raise RuntimeError(f"Registry credential for execution environment `{ee}` is missing either the host, username, or password.") - - host = ee.credential.get_input('host') - username = ee.credential.get_input('username') - password = ee.credential.get_input('password') - token = f"{username}:{password}" - auth_data = {'auths': {host: {'auth': b64encode(token.encode('utf-8')).decode('utf-8')}}} - self.authfile.write(json.dumps(auth_data, indent=4).encode('utf-8')) - bargs.append(f'--authfile={self.authfile.name}') - - if ee.pull: - bargs.append(f'--pull={ee.pull}') - bargs.extend([ee.image]) bargs.extend(['ansible-inventory', '-i', self.source]) @@ -131,12 +113,9 @@ class AnsibleInventoryLoader(object): return data def load(self): - with tempfile.NamedTemporaryFile() as f: - self.authfile = f - os.chmod(self.authfile.name, stat.S_IRUSR | stat.S_IWUSR) - base_args = self.get_base_args() - logger.info('Reading Ansible inventory source: %s', self.source) - return self.command_to_json(base_args) + base_args = self.get_base_args() + logger.info('Reading Ansible inventory source: %s', self.source) + return self.command_to_json(base_args) class Command(BaseCommand): From cab8c690d2908799bcbd6b60a3e4c47c23301c90 Mon Sep 17 00:00:00 2001 From: Alex Corey Date: Wed, 27 Oct 2021 15:19:37 -0400 Subject: [PATCH 39/45] Adds instances to activity stream --- .../screens/ActivityStream/ActivityStream.js | 4 +++ .../screens/InstanceGroup/InstanceGroups.js | 9 +++++-- .../InstanceGroup/InstanceGroups.test.js | 25 ++++++++++++++++++- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/awx/ui/src/screens/ActivityStream/ActivityStream.js b/awx/ui/src/screens/ActivityStream/ActivityStream.js index bb54049772..209da83669 100644 --- a/awx/ui/src/screens/ActivityStream/ActivityStream.js +++ b/awx/ui/src/screens/ActivityStream/ActivityStream.js @@ -188,6 +188,10 @@ function ActivityStream() { > {t`Notification Templates`} + 
{t`Instances`} {t`Instance Groups`} diff --git a/awx/ui/src/screens/InstanceGroup/InstanceGroups.js b/awx/ui/src/screens/InstanceGroup/InstanceGroups.js index 3f92aa6abe..c4ca94541e 100644 --- a/awx/ui/src/screens/InstanceGroup/InstanceGroups.js +++ b/awx/ui/src/screens/InstanceGroup/InstanceGroups.js @@ -1,7 +1,7 @@ import React, { useCallback, useEffect, useState } from 'react'; import { t } from '@lingui/macro'; -import { Route, Switch } from 'react-router-dom'; +import { Route, Switch, useLocation } from 'react-router-dom'; import useRequest from 'hooks/useRequest'; import { SettingsAPI } from 'api'; @@ -14,6 +14,7 @@ import ContainerGroupAdd from './ContainerGroupAdd'; import ContainerGroup from './ContainerGroup'; function InstanceGroups() { + const { pathname } = useLocation(); const { request: settingsRequest, isLoading: isSettingsRequestLoading, @@ -62,10 +63,14 @@ function InstanceGroups() { }); }, []); + const streamType = pathname.includes('instances') + ? 'instance' + : 'instance_group'; + return ( <> diff --git a/awx/ui/src/screens/InstanceGroup/InstanceGroups.test.js b/awx/ui/src/screens/InstanceGroup/InstanceGroups.test.js index 1a310dbe1b..fdd53c6e8f 100644 --- a/awx/ui/src/screens/InstanceGroup/InstanceGroups.test.js +++ b/awx/ui/src/screens/InstanceGroup/InstanceGroups.test.js @@ -1,10 +1,20 @@ import React from 'react'; import { shallow } from 'enzyme'; - +import { InstanceGroupsAPI } from 'api'; import InstanceGroups from './InstanceGroups'; +const mockUseLocationValue = { + pathname: '', +}; +jest.mock('api'); +jest.mock('react-router-dom', () => ({ + ...jest.requireActual('react-router-dom'), + useLocation: () => mockUseLocationValue, +})); describe('', () => { test('should set breadcrumbs', () => { + mockUseLocationValue.pathname = '/instance_groups'; + const wrapper = shallow(); const header = wrapper.find('ScreenHeader'); @@ -15,4 +25,17 @@ describe('', () => { '/instance_groups/container_group/add': 'Create new container group', }); }); + test('should set breadcrumbs', async () => { + mockUseLocationValue.pathname = '/instance_groups/1/instances'; + InstanceGroupsAPI.readInstances.mockResolvedValue({ + data: { results: [{ hostname: 'EC2', id: 1 }] }, + }); + InstanceGroupsAPI.readInstanceOptions.mockResolvedValue({ + data: { actions: {} }, + }); + + const wrapper = shallow(); + + expect(wrapper.find('ScreenHeader').prop('streamType')).toEqual('instance'); + }); }); From a2acbe9fe62134c5e9314756721954e65fab0bf6 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Thu, 28 Oct 2021 09:43:14 -0400 Subject: [PATCH 40/45] Fix incorrect (changed: True) frequent in OCP task logs --- .../management/commands/register_queue.py | 2 +- .../commands/test_register_queue.py | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 awx/main/tests/functional/commands/test_register_queue.py diff --git a/awx/main/management/commands/register_queue.py b/awx/main/management/commands/register_queue.py index 9c05020545..2fa931c88b 100644 --- a/awx/main/management/commands/register_queue.py +++ b/awx/main/management/commands/register_queue.py @@ -36,7 +36,7 @@ class RegisterQueue: ig.policy_instance_minimum = self.instance_min changed = True - if self.is_container_group: + if self.is_container_group and (ig.is_container_group != self.is_container_group): ig.is_container_group = self.is_container_group changed = True diff --git a/awx/main/tests/functional/commands/test_register_queue.py b/awx/main/tests/functional/commands/test_register_queue.py new file 
mode 100644 index 0000000000..aaa9910911 --- /dev/null +++ b/awx/main/tests/functional/commands/test_register_queue.py @@ -0,0 +1,26 @@ +from io import StringIO +from contextlib import redirect_stdout + +import pytest + +from awx.main.management.commands.register_queue import RegisterQueue +from awx.main.models.ha import InstanceGroup + + +@pytest.mark.django_db +def test_openshift_idempotence(): + def perform_register(): + with StringIO() as buffer: + with redirect_stdout(buffer): + RegisterQueue('default', 100, 0, [], is_container_group=True).register() + return buffer.getvalue() + + assert '(changed: True)' in perform_register() + assert '(changed: True)' not in perform_register() + assert '(changed: True)' not in perform_register() + + ig = InstanceGroup.objects.get(name='default') + assert ig.policy_instance_percentage == 100 + assert ig.policy_instance_minimum == 0 + assert ig.policy_instance_list == [] + assert ig.is_container_group is True From abb1125a2c69117f9cfeca2aa4658c491ae8c756 Mon Sep 17 00:00:00 2001 From: Kersom <9053044+nixocio@users.noreply.github.com> Date: Mon, 1 Nov 2021 15:50:59 -0400 Subject: [PATCH 41/45] Display host name for Associate Modal (#5407) Display host name for Associate Modal See: https://github.com/ansible/awx/issues/11256 --- awx/ui/src/components/AssociateModal/AssociateModal.js | 5 +---- awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/awx/ui/src/components/AssociateModal/AssociateModal.js b/awx/ui/src/components/AssociateModal/AssociateModal.js index abfc2293ac..38335020d2 100644 --- a/awx/ui/src/components/AssociateModal/AssociateModal.js +++ b/awx/ui/src/components/AssociateModal/AssociateModal.js @@ -18,10 +18,7 @@ const QS_CONFIG = (order_by = 'name') => function AssociateModal({ header = t`Items`, - columns = [ - { key: 'hostname', name: t`Name` }, - { key: 'node_type', name: t`Node Type` }, - ], + columns = [], title = t`Select Items`, onClose, onAssociate, diff --git a/awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js b/awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js index f0a87fe653..946bc81d2e 100644 --- a/awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js +++ b/awx/ui/src/screens/InstanceGroup/Instances/InstanceList.js @@ -283,6 +283,10 @@ function InstanceList() { title={t`Select Instances`} optionsRequest={readInstancesOptions} displayKey="hostname" + columns={[ + { key: 'hostname', name: t`Name` }, + { key: 'node_type', name: t`Node Type` }, + ]} /> )} {error && ( From 4cfa4eaf8e1ef7c52dc4e94d4144d63377693f72 Mon Sep 17 00:00:00 2001 From: nixocio Date: Fri, 29 Oct 2021 16:14:23 -0400 Subject: [PATCH 42/45] Update validators for Misc Auth Edit * Update SharedFields to use number validator instead of integer * Use number validation for SESSIONS_PER_USER See: https://github.com/ansible/tower/issues/5396 --- awx/ui/src/screens/Setting/shared/SharedFields.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/awx/ui/src/screens/Setting/shared/SharedFields.js b/awx/ui/src/screens/Setting/shared/SharedFields.js index d085761a93..d273737f50 100644 --- a/awx/ui/src/screens/Setting/shared/SharedFields.js +++ b/awx/ui/src/screens/Setting/shared/SharedFields.js @@ -22,7 +22,7 @@ import CodeEditor from 'components/CodeEditor'; import { PasswordInput } from 'components/FormField'; import { FormFullWidthLayout } from 'components/FormLayout'; import Popover from 'components/Popover'; -import { combine, integer, 
minMaxValue, required, url } from 'util/validators'; +import { combine, minMaxValue, required, url, number } from 'util/validators'; import AlertModal from 'components/AlertModal'; import RevertButton from './RevertButton'; @@ -365,12 +365,11 @@ const InputField = ({ name, config, type = 'text', isRequired = false }) => { const validators = [ ...(isRequired ? [required(null)] : []), ...(type === 'url' ? [url()] : []), - ...(type === 'number' - ? [integer(), minMaxValue(min_value, max_value)] - : []), + ...(type === 'number' ? [number(), minMaxValue(min_value, max_value)] : []), ]; const [field, meta] = useField({ name, validate: combine(validators) }); const isValid = !(meta.touched && meta.error); + return config ? ( { validated={isValid ? 'default' : 'error'} > Date: Tue, 2 Nov 2021 12:41:45 -0400 Subject: [PATCH 43/45] Fixed error dropped on floor - save receptor detail when it applies --- awx/main/tasks.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index a782a8e04e..658cf19387 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3151,8 +3151,15 @@ class AWXReceptorJob: try: resultsock = receptor_ctl.get_work_results(self.unit_id, return_sockfile=True) lines = resultsock.readlines() - self.task.instance.result_traceback = b"".join(lines).decode() - self.task.instance.save(update_fields=['result_traceback']) + receptor_output = b"".join(lines).decode() + if receptor_output: + self.task.instance.result_traceback = receptor_output + self.task.instance.save(update_fields=['result_traceback']) + elif detail: + self.task.instance.result_traceback = detail + self.task.instance.save(update_fields=['result_traceback']) + else: + logger.warn(f'No result details or output from {self.task.instance.log_format}, status:\n{unit_status}') except Exception: raise RuntimeError(detail) From 9f8250bd4757f050b35f71c8b3bdbd464f694c1d Mon Sep 17 00:00:00 2001 From: chris meyers Date: Fri, 15 Oct 2021 04:49:28 -0400 Subject: [PATCH 44/45] add events to job lifecycle * Note in the job lifecycle when the controller_node and execution_node are chosen. 
This event occurs most commonly in the task manager with a couple of exceptions that happen when we dynamically create dependent jobs on the fly in tasks.py --- awx/main/models/unified_jobs.py | 5 +++++ awx/main/scheduler/task_manager.py | 5 +++++ awx/main/tasks.py | 9 +++++++++ 3 files changed, 19 insertions(+) diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 2cb2fd28af..5281f25e6f 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -1506,6 +1506,11 @@ class UnifiedJob( extra["blocked_by"] = blocked_by_msg else: msg = f"{self._meta.model_name}-{self.id} {state.replace('_', ' ')}" + + if state == "controller_node_chosen": + extra["controller_node"] = self.controller_node or "NOT_SET" + elif state == "execution_node_chosen": + extra["execution_node"] = self.execution_node or "NOT_SET" logger_job_lifecycle.debug(msg, extra=extra) @property diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 2944562723..ff48c5267c 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -291,6 +291,7 @@ class TaskManager: # act as the controller for k8s API interaction try: task.controller_node = Instance.choose_online_control_plane_node() + task.log_lifecycle("controller_node_chosen") except IndexError: logger.warning("No control plane nodes available to run containerized job {}".format(task.log_format)) return @@ -298,19 +299,23 @@ class TaskManager: # project updates and system jobs don't *actually* run in pods, so # just pick *any* non-containerized host and use it as the execution node task.execution_node = Instance.choose_online_control_plane_node() + task.log_lifecycle("execution_node_chosen") logger.debug('Submitting containerized {} to queue {}.'.format(task.log_format, task.execution_node)) else: task.instance_group = rampart_group task.execution_node = instance.hostname + task.log_lifecycle("execution_node_chosen") if instance.node_type == 'execution': try: task.controller_node = Instance.choose_online_control_plane_node() + task.log_lifecycle("controller_node_chosen") except IndexError: logger.warning("No control plane nodes available to manage {}".format(task.log_format)) return else: # control plane nodes will manage jobs locally for performance and resilience task.controller_node = task.execution_node + task.log_lifecycle("controller_node_chosen") logger.debug('Submitting job {} to queue {} controlled by {}.'.format(task.log_format, task.execution_node, task.controller_node)) with disable_activity_stream(): task.celery_task_id = str(uuid.uuid4()) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 658cf19387..acc60fbf98 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -1914,6 +1914,7 @@ class RunJob(BaseTask): status='running', instance_group=pu_ig, execution_node=pu_en, + controller_node=pu_en, celery_task_id=job.celery_task_id, ) if branch_override: @@ -1922,6 +1923,8 @@ if 'update_' not in sync_metafields['job_tags']: sync_metafields['scm_revision'] = job_revision local_project_sync = job.project.create_project_update(_eager_fields=sync_metafields) + local_project_sync.log_lifecycle("controller_node_chosen") + local_project_sync.log_lifecycle("execution_node_chosen") create_partition(local_project_sync.event_class._meta.db_table, start=local_project_sync.created) # save the associated job before calling run() so that a # cancel() call on the job can cancel the project update @@ -2214,10 +2217,13 @@ class 
From d0c5c3d3cf7f2290d5741f4eb33db999388399e3 Mon Sep 17 00:00:00 2001
From: chris meyers
Date: Mon, 25 Oct 2021 15:31:48 -0400
Subject: [PATCH 45/45] add work_unit_id to job lifecycle

---
 awx/main/models/unified_jobs.py |  7 ++++++-
 awx/main/tasks.py               | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py
index 5281f25e6f..671daf104d 100644
--- a/awx/main/models/unified_jobs.py
+++ b/awx/main/models/unified_jobs.py
@@ -1497,7 +1497,12 @@ class UnifiedJob(
         return False

     def log_lifecycle(self, state, blocked_by=None):
-        extra = {'type': self._meta.model_name, 'task_id': self.id, 'state': state}
+        extra = {
+            'type': self._meta.model_name,
+            'task_id': self.id,
+            'state': state,
+            'work_unit_id': self.work_unit_id,
+        }
         if self.unified_job_template:
             extra["template_name"] = self.unified_job_template.name
         if state == "blocked" and blocked_by:
diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index acc60fbf98..93ab6ccd81 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -3107,7 +3107,21 @@ class AWXReceptorJob:
             _kw['tlsclient'] = get_tls_client(use_stream_tls)

         result = receptor_ctl.submit_work(worktype=self.work_type, payload=sockout.makefile('rb'), params=self.receptor_params, signwork=self.sign_work, **_kw)
         self.unit_id = result['unitid']
+        # Update the job with the work unit in-memory so that log_lifecycle
+        # will print out the work unit that is to be associated with the job in the
+        # database via the update_model() call.
+        # We want to log the work_unit_id as early as possible. A failure can happen
+        # between when we start the job in receptor and when we associate the job
+        # with its work_unit_id. In that case, there will be work running in receptor
+        # and the Controller will not know which job it is associated with.
+        # We do not programmatically handle this case. Ideally, we would handle it
+        # with a reaper task. The two distinct job lifecycle events below let us at
+        # least detect this edge case: if work_unit_id_received occurs without a
+        # matching work_unit_id_assigned event, then this case may have occurred.
+        self.task.instance.work_unit_id = result['unitid']  # Set work_unit_id in-memory only
+        self.task.instance.log_lifecycle("work_unit_id_received")
         self.task.update_model(self.task.instance.pk, work_unit_id=result['unitid'])
+        self.task.instance.log_lifecycle("work_unit_id_assigned")

         sockin.close()
         sockout.close()
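The ordering of the two new events is the point of this patch: logging work_unit_id_received before the database write and work_unit_id_assigned after it brackets the only window in which receptor can be running work the controller has not yet recorded. A sketch of that ordering; submit_work() and its 'unitid' key are as shown above, while the job object and the persist callback are simplified stand-ins for AWX's update_model() machinery:

    def submit_and_track(receptor_ctl, job, persist, work_type, payload, params):
        """Sketch of the submit -> log -> persist ordering from the patch above."""
        result = receptor_ctl.submit_work(worktype=work_type, payload=payload, params=params)
        unit_id = result['unitid']
        # Record the unit id in memory and log immediately, before any DB write,
        # so the log carries the work_unit_id even if the next step never runs.
        job.work_unit_id = unit_id
        job.log_lifecycle('work_unit_id_received')
        # Persist the association. A crash between the two events surfaces as
        # work_unit_id_received with no matching work_unit_id_assigned, which
        # flags a receptor work unit that may have been orphaned.
        persist(job.pk, work_unit_id=unit_id)
        job.log_lifecycle('work_unit_id_assigned')
        return unit_id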