From bdabe3602931acfcd61f441da823205377c23868 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Thu, 15 Oct 2020 15:25:47 -0400 Subject: [PATCH 1/8] reduce parent->child lock contention * We update the parent unified job template to point at new jobs created. We also update a similar foreign key when the job finishes running. This causes lock contention when the job template is allow_simultaneous and there are a lot of jobs from that job template running in parallel. I've seen as bad as 5 minutes waiting for the lock when a job finishes. * This change moves the parent->child update to OUTSIDE of the transaction if the job is allow_simultaneous (inherited from the parent unified job). We sacrafice a bit of correctness for performance. The logic is, if you are launching 1,000 parallel jobs do you really care that the job template contains a pointer to the last one you launched? Probably not. If you do, you can always query jobs related to the job template sorted by created time. --- awx/main/models/unified_jobs.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 1abbb29fcb..c50c8668d5 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -873,7 +873,13 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique # If status changed, update the parent instance. if self.status != status_before: - self._update_parent_instance() + # Update parent outside of the transaction for Job w/ allow_simultaneous=True + # This dodges lock contention at the expense of the foreign key not being + # completely correct. + if getattr(self, 'allow_simultaneous', False): + connection.on_commit(self._update_parent_instance) + else: + self._update_parent_instance() # Done. return result From 11cc6362b5ce63137adeb4b653ec359f3adfc781 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Fri, 2 Oct 2020 13:16:21 -0400 Subject: [PATCH 2/8] reduce per-job database query count * Do not query the database for the set of Instance that belong to the group for which we are trying to fit a job on, for each job. * Instead, cache the set of instances per-instance group. --- awx/main/models/ha.py | 10 +++-- awx/main/scheduler/task_manager.py | 53 +++++++++++++++++++++------ awx/main/tests/unit/models/test_ha.py | 35 +++++++----------- 3 files changed, 61 insertions(+), 37 deletions(-) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index fc4e9c022e..5071786653 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -261,18 +261,20 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): app_label = 'main' - def fit_task_to_most_remaining_capacity_instance(self, task): + @staticmethod + def fit_task_to_most_remaining_capacity_instance(task, instances): instance_most_capacity = None - for i in self.instances.filter(capacity__gt=0, enabled=True).order_by('hostname'): + for i in instances: if i.remaining_capacity >= task.task_impact and \ (instance_most_capacity is None or i.remaining_capacity > instance_most_capacity.remaining_capacity): instance_most_capacity = i return instance_most_capacity - def find_largest_idle_instance(self): + @staticmethod + def find_largest_idle_instance(instances): largest_instance = None - for i in self.instances.filter(capacity__gt=0, enabled=True).order_by('hostname'): + for i in instances: if i.jobs_running == 0: if largest_instance is None: largest_instance = i diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 9f4818bd37..861aa0b63f 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -7,6 +7,7 @@ import logging import uuid import json import random +from types import SimpleNamespace # Django from django.db import transaction, connection @@ -45,6 +46,15 @@ logger = logging.getLogger('awx.main.scheduler') class TaskManager(): def __init__(self): + ''' + Do NOT put database queries or other potentially expensive operations + in the task manager init. The task manager object is created every time a + job is created, transitions state, and every 30 seconds on each tower node. + More often then not, the object is destroyed quickly because the NOOP case is hit. + + The NOOP case is short-circuit logic. If the task manager realizes that another instance + of the task manager is already running, then it short-circuits and decides not to run. + ''' self.graph = dict() # start task limit indicates how many pending jobs can be started on this # .schedule() run. Starting jobs is expensive, and there is code in place to reap @@ -52,10 +62,30 @@ class TaskManager(): # 5 minutes to start pending jobs. If this limit is reached, pending jobs # will no longer be started and will be started on the next task manager cycle. self.start_task_limit = settings.START_TASK_LIMIT + + def after_lock_init(self): + ''' + Init AFTER we know this instance of the task manager will run because the lock is acquired. + ''' + instances = Instance.objects.filter(capacity__gt=0, enabled=True) + self.real_instances = {i.hostname: i for i in instances} + + instances_partial = [SimpleNamespace(obj=instance, + remaining_capacity=instance.remaining_capacity, + capacity=instance.capacity, + jobs_running=instance.jobs_running, + hostname=instance.hostname) for instance in instances] + + instances_by_hostname = {i.hostname: i for i in instances_partial} + for rampart_group in InstanceGroup.objects.prefetch_related('instances'): self.graph[rampart_group.name] = dict(graph=DependencyGraph(rampart_group.name), capacity_total=rampart_group.capacity, - consumed_capacity=0) + consumed_capacity=0, + instances=[]) + for instance in rampart_group.instances.filter(capacity__gt=0, enabled=True).order_by('hostname'): + if instance.hostname in instances_by_hostname: + self.graph[rampart_group.name]['instances'].append(instances_by_hostname[instance.hostname]) def is_job_blocked(self, task): # TODO: I'm not happy with this, I think blocking behavior should be decided outside of the dependency graph @@ -466,7 +496,6 @@ class TaskManager(): continue preferred_instance_groups = task.preferred_instance_groups found_acceptable_queue = False - idle_instance_that_fits = None if isinstance(task, WorkflowJob): if task.unified_job_template_id in running_workflow_templates: if not task.allow_simultaneous: @@ -483,24 +512,23 @@ class TaskManager(): found_acceptable_queue = True break - if idle_instance_that_fits is None: - idle_instance_that_fits = rampart_group.find_largest_idle_instance() remaining_capacity = self.get_remaining_capacity(rampart_group.name) if not rampart_group.is_containerized and self.get_remaining_capacity(rampart_group.name) <= 0: logger.debug("Skipping group {}, remaining_capacity {} <= 0".format( rampart_group.name, remaining_capacity)) continue - execution_instance = rampart_group.fit_task_to_most_remaining_capacity_instance(task) - if execution_instance: - logger.debug("Starting {} in group {} instance {} (remaining_capacity={})".format( - task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity)) - elif not execution_instance and idle_instance_that_fits: + execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(task, self.graph[rampart_group.name]['instances']) or \ + InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances']) + + if execution_instance or rampart_group.is_containerized: if not rampart_group.is_containerized: - execution_instance = idle_instance_that_fits + execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact) + execution_instance.jobs_running += 1 logger.debug("Starting {} in group {} instance {} (remaining_capacity={})".format( task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity)) - if execution_instance or rampart_group.is_containerized: + + execution_instance = self.real_instances[execution_instance.hostname] self.graph[rampart_group.name]['graph'].add_job(task) self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance) found_acceptable_queue = True @@ -572,6 +600,9 @@ class TaskManager(): def _schedule(self): finished_wfjs = [] all_sorted_tasks = self.get_tasks() + + self.after_lock_init() + if len(all_sorted_tasks) > 0: # TODO: Deal with # latest_project_updates = self.get_latest_project_update_tasks(all_sorted_tasks) diff --git a/awx/main/tests/unit/models/test_ha.py b/awx/main/tests/unit/models/test_ha.py index 0e29caf8aa..2534acfd15 100644 --- a/awx/main/tests/unit/models/test_ha.py +++ b/awx/main/tests/unit/models/test_ha.py @@ -45,19 +45,14 @@ class TestInstanceGroup(object): (T(100), Is([50, 0, 20, 99, 11, 1, 5, 99]), None, "The task don't a fit, you must a quit!"), ]) def test_fit_task_to_most_remaining_capacity_instance(self, task, instances, instance_fit_index, reason): - with mock.patch.object(InstanceGroup, - 'instances', - Mock(spec_set=['filter'], - filter=lambda *args, **kargs: Mock(spec_set=['order_by'], - order_by=lambda x: instances))): - ig = InstanceGroup(id=10) + ig = InstanceGroup(id=10) - if instance_fit_index is None: - assert ig.fit_task_to_most_remaining_capacity_instance(task) is None, reason - else: - assert ig.fit_task_to_most_remaining_capacity_instance(task) == \ - instances[instance_fit_index], reason + instance_picked = ig.fit_task_to_most_remaining_capacity_instance(task, instances) + if instance_fit_index is None: + assert instance_picked is None, reason + else: + assert instance_picked == instances[instance_fit_index], reason @pytest.mark.parametrize('instances,instance_fit_index,reason', [ (Is([(0, 100)]), 0, "One idle instance, pick it"), @@ -70,16 +65,12 @@ class TestInstanceGroup(object): def filter_offline_instances(*args): return filter(lambda i: i.capacity > 0, instances) - with mock.patch.object(InstanceGroup, - 'instances', - Mock(spec_set=['filter'], - filter=lambda *args, **kargs: Mock(spec_set=['order_by'], - order_by=filter_offline_instances))): - ig = InstanceGroup(id=10) + ig = InstanceGroup(id=10) + instances_online_only = filter_offline_instances(instances) - if instance_fit_index is None: - assert ig.find_largest_idle_instance() is None, reason - else: - assert ig.find_largest_idle_instance() == \ - instances[instance_fit_index], reason + if instance_fit_index is None: + assert ig.find_largest_idle_instance(instances_online_only) is None, reason + else: + assert ig.find_largest_idle_instance(instances_online_only) == \ + instances[instance_fit_index], reason From 84cb7be07914cd6b5f939459ffcddc02bbeef341 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Mon, 19 Oct 2020 10:24:50 -0400 Subject: [PATCH 3/8] fill in postgres application_name on connection * Tried to fill in application_name in awx/__init__.py but I think that is too late * Fill in database application_name with enough information to easily trace the connection from postgres back to the node and pid that initiated the connection. * Set application_name in django settings so that application_name is set _before_ the first postgres connection is established. --- awx/main/management/commands/check_migrations.py | 2 ++ awx/settings/development.py | 3 +++ awx/settings/production.py | 3 +++ 3 files changed, 8 insertions(+) diff --git a/awx/main/management/commands/check_migrations.py b/awx/main/management/commands/check_migrations.py index 50ea354960..6f9cfc7727 100644 --- a/awx/main/management/commands/check_migrations.py +++ b/awx/main/management/commands/check_migrations.py @@ -8,5 +8,7 @@ class Command(MakeMigrations): def execute(self, *args, **options): settings = connections['default'].settings_dict.copy() settings['ENGINE'] = 'sqlite3' + if 'application_name' in settings['OPTIONS']: + del settings['OPTIONS']['application_name'] connections['default'] = DatabaseWrapper(settings) return MakeMigrations().execute(*args, **options) diff --git a/awx/settings/development.py b/awx/settings/development.py index 3a4e008488..108767b98c 100644 --- a/awx/settings/development.py +++ b/awx/settings/development.py @@ -184,3 +184,6 @@ else: pass AWX_CALLBACK_PROFILE = True + +if 'sqlite3' not in DATABASES['default']['ENGINE']: # noqa + DATABASES['default'].setdefault('OPTIONS', dict()).setdefault('application_name', f'{CLUSTER_HOST_ID}-{os.getpid()}-{" ".join(sys.argv)}'[:63]) # noqa diff --git a/awx/settings/production.py b/awx/settings/production.py index c2cde28c0f..fb24b7087f 100644 --- a/awx/settings/production.py +++ b/awx/settings/production.py @@ -102,6 +102,7 @@ except IOError: else: raise +# The below runs AFTER all of the custom settings are imported. CELERYBEAT_SCHEDULE.update({ # noqa 'isolated_heartbeat': { @@ -110,3 +111,5 @@ CELERYBEAT_SCHEDULE.update({ # noqa 'options': {'expires': AWX_ISOLATED_PERIODIC_CHECK * 2}, # noqa } }) + +DATABASES['default'].setdefault('OPTIONS', dict()).setdefault('application_name', f'{CLUSTER_HOST_ID}-{os.getpid()}-{" ".join(sys.argv)}'[:63]) # noqa From 7a9b55c21b52b33677bbbabaa725a10e9053c884 Mon Sep 17 00:00:00 2001 From: mabashian Date: Tue, 20 Oct 2020 10:31:30 -0400 Subject: [PATCH 4/8] Replace SETTINGS > SYSTEM with SETTINGS > SYSTEM to get around issue with translating this string --- awx/ui/client/src/license/license.partial.html | 2 +- awx/ui/po/ansible-tower-ui.pot | 2 +- awx/ui/po/es.po | 4 ++-- awx/ui/po/fr.po | 4 ++-- awx/ui/po/ja.po | 4 ++-- awx/ui/po/nl.po | 4 ++-- awx/ui/po/zh.po | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/awx/ui/client/src/license/license.partial.html b/awx/ui/client/src/license/license.partial.html index 9a4a9a80f7..2dc7beeb9c 100644 --- a/awx/ui/client/src/license/license.partial.html +++ b/awx/ui/client/src/license/license.partial.html @@ -125,7 +125,7 @@
- Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM. + Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM.
diff --git a/awx/ui/po/ansible-tower-ui.pot b/awx/ui/po/ansible-tower-ui.pot index f3fb4fe286..5da8733512 100644 --- a/awx/ui/po/ansible-tower-ui.pot +++ b/awx/ui/po/ansible-tower-ui.pot @@ -5070,7 +5070,7 @@ msgid "Provide environment variables to pass to the custom inventory script." msgstr "" #: client/src/license/license.partial.html:128 -msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." +msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." msgstr "" #: client/src/templates/job_templates/job-template.form.js:374 diff --git a/awx/ui/po/es.po b/awx/ui/po/es.po index c70c66ee02..fdc1229418 100644 --- a/awx/ui/po/es.po +++ b/awx/ui/po/es.po @@ -5179,8 +5179,8 @@ msgid "Provide the named URL encoded name or id of the remote Tower inventory to msgstr "Indique la URL, el nombre cifrado o id del inventario remoto de Tower para importarlos." #: client/src/license/license.partial.html:128 -msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." -msgstr "Proporcione sus credenciales de cliente de Red Hat y podrá elegir de una lista de sus licencias disponibles. Las credenciales que utilice se almacenarán para su uso futuro en la recuperación de las licencias de renovación o ampliadas. Puede actualizarlas o eliminarlas en CONFIGURACIÓN > SISTEMA." +msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." +msgstr "Proporcione sus credenciales de cliente de Red Hat y podrá elegir de una lista de sus licencias disponibles. Las credenciales que utilice se almacenarán para su uso futuro en la recuperación de las licencias de renovación o ampliadas. Puede actualizarlas o eliminarlas en CONFIGURACIÓN > SISTEMA." #: client/src/templates/job_templates/job-template.form.js:374 #: client/src/templates/job_templates/job-template.form.js:382 diff --git a/awx/ui/po/fr.po b/awx/ui/po/fr.po index 59ed681256..513c1718aa 100644 --- a/awx/ui/po/fr.po +++ b/awx/ui/po/fr.po @@ -5185,8 +5185,8 @@ msgid "Provide the named URL encoded name or id of the remote Tower inventory to msgstr "Fournir le nom encodé de l'URL ou d'id de l'inventaire distant de Tower à importer." #: client/src/license/license.partial.html:128 -msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." -msgstr "Fournissez vos informations d’identification client Red Hat et choisissez parmi une liste de licences disponibles pour vous. Les informations d'identification que vous utilisez seront stockées pour une utilisation ultérieure lors de la récupération des licences renouvelées ou étendues. Vous pouvez les mettre à jour ou les supprimer dans PARAMÈTRES > SYSTÈME." +msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." +msgstr "Fournissez vos informations d’identification client Red Hat et choisissez parmi une liste de licences disponibles pour vous. Les informations d'identification que vous utilisez seront stockées pour une utilisation ultérieure lors de la récupération des licences renouvelées ou étendues. Vous pouvez les mettre à jour ou les supprimer dans PARAMÈTRES > SYSTÈME." #: client/src/templates/job_templates/job-template.form.js:374 #: client/src/templates/job_templates/job-template.form.js:382 diff --git a/awx/ui/po/ja.po b/awx/ui/po/ja.po index 617c5c66ca..9c04b6160a 100644 --- a/awx/ui/po/ja.po +++ b/awx/ui/po/ja.po @@ -5102,8 +5102,8 @@ msgid "Provide environment variables to pass to the custom inventory script." msgstr "カスタムインベントリースクリプトに渡す環境変数を指定します。" #: client/src/license/license.partial.html:128 -msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." -msgstr "Red Hat の顧客認証情報を指定して、利用可能なライセンス一覧から選択してください。使用した認証情報は、今後、ライセンスの更新や延長情報を取得する時に利用できるように保存されます。設定 > システムでこの情報は更新または削除できます。" +msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." +msgstr "Red Hat の顧客認証情報を指定して、利用可能なライセンス一覧から選択してください。使用した認証情報は、今後、ライセンスの更新や延長情報を取得する時に利用できるように保存されます。設定 > システムでこの情報は更新または削除できます。" #: client/src/templates/job_templates/job-template.form.js:374 #: client/src/templates/job_templates/job-template.form.js:382 diff --git a/awx/ui/po/nl.po b/awx/ui/po/nl.po index 721a28368c..c311590856 100644 --- a/awx/ui/po/nl.po +++ b/awx/ui/po/nl.po @@ -5183,8 +5183,8 @@ msgid "Provide the named URL encoded name or id of the remote Tower inventory to msgstr "Voer de URL, versleutelde naam of ID of de externe inventaris in die geïmporteerd moet worden." #: client/src/license/license.partial.html:128 -msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." -msgstr "Geef uw Red Hat-klantengegevens door en u kunt kiezen uit een lijst met beschikbare licenties. De toegangsgegevens die u gebruikt, worden opgeslagen voor toekomstig gebruik bij het ophalen van verlengingen of uitbreidingen van licenties. U kunt deze updaten of verwijderen in INSTELLINGEN > SYSTEEM." +msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." +msgstr "Geef uw Red Hat-klantengegevens door en u kunt kiezen uit een lijst met beschikbare licenties. De toegangsgegevens die u gebruikt, worden opgeslagen voor toekomstig gebruik bij het ophalen van verlengingen of uitbreidingen van licenties. U kunt deze updaten of verwijderen in INSTELLINGEN > SYSTEEM." #: client/src/templates/job_templates/job-template.form.js:374 #: client/src/templates/job_templates/job-template.form.js:382 diff --git a/awx/ui/po/zh.po b/awx/ui/po/zh.po index 92348c9c6e..957b5132f1 100644 --- a/awx/ui/po/zh.po +++ b/awx/ui/po/zh.po @@ -5183,8 +5183,8 @@ msgid "Provide the named URL encoded name or id of the remote Tower inventory to msgstr "提供要导入的远程 Tower 清单的命名 URL 编码名称或 ID。" #: client/src/license/license.partial.html:128 -msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." -msgstr "提供您的红帽客户凭证,您可以从可用许可证列表中进行选择。您使用的凭证将存储以供将来用于检索续订或扩展许可证。您可以在“设置”>“系统”中更新或删除它们。" +msgid "Provide your Red Hat customer credentials and you can choose from a list of your available licenses. The credentials you use will be stored for future use in retrieving renewal or expanded licenses. You can update or remove them in SETTINGS > SYSTEM." +msgstr "提供您的红帽客户凭证,您可以从可用许可证列表中进行选择。您使用的凭证将存储以供将来用于检索续订或扩展许可证。您可以在“设置”>“系统”中更新或删除它们。" #: client/src/templates/job_templates/job-template.form.js:374 #: client/src/templates/job_templates/job-template.form.js:382 From 7bc7cb00acffe7729f0bebad799e79f15b4cb51e Mon Sep 17 00:00:00 2001 From: Jake McDermott Date: Thu, 15 Oct 2020 09:54:40 -0400 Subject: [PATCH 5/8] Use current year in about modal --- awx/ui/client/src/about/about.controller.js | 1 + awx/ui/client/src/about/about.partial.html | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/awx/ui/client/src/about/about.controller.js b/awx/ui/client/src/about/about.controller.js index 868adf01dc..e2ef6e776a 100644 --- a/awx/ui/client/src/about/about.controller.js +++ b/awx/ui/client/src/about/about.controller.js @@ -7,6 +7,7 @@ export default ['$rootScope', '$scope', '$location', 'ConfigService', 'lastPath' $scope.ansible_version = config.ansible_version; $scope.subscription = config.license_info.subscription_name; $scope.speechBubble = createSpeechBubble($rootScope.BRAND_NAME, config.version); + $scope.currentYear = new Date().getFullYear(); $('#about-modal').modal('show'); }); diff --git a/awx/ui/client/src/about/about.partial.html b/awx/ui/client/src/about/about.partial.html index a5faa70e84..2be5f6234b 100644 --- a/awx/ui/client/src/about/about.partial.html +++ b/awx/ui/client/src/about/about.partial.html @@ -29,7 +29,7 @@ Ansible {{ ansible_version }}
- Copyright © 2019 Red Hat, Inc.
+ Copyright © {{ currentYear }} Red Hat, Inc.
Visit Ansible.com for more information.

From c373d5307f8c8df29a049bccf50172cdf9e7f8c2 Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Wed, 21 Oct 2020 09:20:26 -0400 Subject: [PATCH 6/8] allow the CLI to associate Galaxy credentials to Organizations $ awx organizations associate Default --galaxy_credential "Ansible Galaxy" --- awxkit/awxkit/cli/custom.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/awxkit/awxkit/cli/custom.py b/awxkit/awxkit/cli/custom.py index fb45043632..38d97d8895 100644 --- a/awxkit/awxkit/cli/custom.py +++ b/awxkit/awxkit/cli/custom.py @@ -246,6 +246,7 @@ class AssociationMixin(object): 'success_notification': 'notification_templates', 'failure_notification': 'notification_templates', 'credential': 'credentials', + 'galaxy_credential': 'credentials', }[resource] def get(self, **kwargs): @@ -367,9 +368,11 @@ class OrganizationNotificationDisAssociation(NotificationAssociateMixin, CustomA OrganizationNotificationAssociation.targets.update({ 'approval_notification': ['notification_templates_approvals', 'notification_template'], + 'galaxy_credential': ['galaxy_credentials', 'credential'], }) OrganizationNotificationDisAssociation.targets.update({ 'approval_notification': ['notification_templates_approvals', 'notification_template'], + 'galaxy_credential': ['galaxy_credentials', 'credential'], }) From ce052922c6f1f343b87a5d4194f743ccc441cc5f Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Mon, 19 Oct 2020 16:23:57 -0400 Subject: [PATCH 7/8] terminal graph of job status changes * Visualize how jobs go from pending, waiting, running over time --- awx/main/management/commands/graph_jobs.py | 117 +++++++++++++++++++++ docs/debugging.md | 26 +++++ docs/licenses/asciichartpy.txt | 21 ++++ requirements/requirements.in | 1 + requirements/requirements.txt | 3 +- 5 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 awx/main/management/commands/graph_jobs.py create mode 100644 docs/licenses/asciichartpy.txt diff --git a/awx/main/management/commands/graph_jobs.py b/awx/main/management/commands/graph_jobs.py new file mode 100644 index 0000000000..f1c8ad75e1 --- /dev/null +++ b/awx/main/management/commands/graph_jobs.py @@ -0,0 +1,117 @@ +# Python +import asciichartpy as chart +import collections +import time +import sys + +# Django +from django.db.models import Count +from django.core.management.base import BaseCommand + +# AWX +from awx.main.models import ( + Job, + Instance +) + + +DEFAULT_WIDTH = 100 +DEFAULT_HEIGHT = 30 + + +def chart_color_lookup(color_str): + return getattr(chart, color_str) + + +def clear_screen(): + print(chr(27) + "[2J") + + +class JobStatus(): + def __init__(self, status, color, width): + self.status = status + self.color = color + self.color_code = chart_color_lookup(color) + self.x = collections.deque(maxlen=width) + self.y = collections.deque(maxlen=width) + + def tick(self, x, y): + self.x.append(x) + self.y.append(y) + + +class JobStatusController: + RESET = chart_color_lookup('reset') + + def __init__(self, width): + self.plots = [ + JobStatus('pending', 'red', width), + JobStatus('waiting', 'blue', width), + JobStatus('running', 'green', width) + ] + self.ts_start = int(time.time()) + + def tick(self): + ts = int(time.time()) - self.ts_start + q = Job.objects.filter(status__in=['pending','waiting','running']).values_list('status').order_by().annotate(Count('status')) + status_count = dict(pending=0, waiting=0, running=0) + for status, count in q: + status_count[status] = count + + for p in self.plots: + p.tick(ts, status_count[p.status]) + + def series(self): + return [list(p.y) for p in self.plots] + + def generate_status(self): + line = "" + lines = [] + for p in self.plots: + lines.append(f'{p.color_code}{p.status} {p.y[-1]}{self.RESET}') + + line += ", ".join(lines) + '\n' + + width = 5 + time_running = int(time.time()) - self.ts_start + instances = Instance.objects.all().order_by('hostname') + line += "Capacity: " + ", ".join([f"{instance.capacity:{width}}" for instance in instances]) + '\n' + line += "Remaining: " + ", ".join([f"{instance.remaining_capacity:{width}}" for instance in instances]) + '\n' + line += f"Seconds running: {time_running}" + '\n' + + return line + + +class Command(BaseCommand): + help = "Plot pending, waiting, running jobs over time on the terminal" + + def add_arguments(self, parser): + parser.add_argument('--refresh', dest='refresh', type=float, default=1.0, + help='Time between refreshes of the graph and data in seconds (defaults to 1.0)') + parser.add_argument('--width', dest='width', type=int, default=DEFAULT_WIDTH, + help=f'Width of the graph (defaults to {DEFAULT_WIDTH})') + parser.add_argument('--height', dest='height', type=int, default=DEFAULT_HEIGHT, + help=f'Height of the graph (defaults to {DEFAULT_HEIGHT})') + + def handle(self, *args, **options): + refresh_seconds = options['refresh'] + width = options['width'] + height = options['height'] + + jctl = JobStatusController(width) + + conf = { + 'colors': [chart_color_lookup(p.color) for p in jctl.plots], + 'height': height, + } + + while True: + jctl.tick() + + draw = chart.plot(jctl.series(), conf) + status_line = jctl.generate_status() + clear_screen() + print(draw) + sys.stdout.write(status_line) + time.sleep(refresh_seconds) + diff --git a/docs/debugging.md b/docs/debugging.md index 62a59d5317..1452c3c4bb 100644 --- a/docs/debugging.md +++ b/docs/debugging.md @@ -98,3 +98,29 @@ sdb-listen This will open a Python process that listens for new debugger sessions and automatically connects to them for you. + +Graph Jobs +---------- +The `awx-manage graph_jobs` can be used to visualize how Jobs progress from +pending to waiting to running. + +``` +awx-manage graph_jobs --help +usage: awx-manage graph_jobs [-h] [--refresh REFRESH] [--width WIDTH] + [--height HEIGHT] [--version] [-v {0,1,2,3}] + [--settings SETTINGS] [--pythonpath PYTHONPATH] + [--traceback] [--no-color] [--force-color] + +Plot pending, waiting, running jobs over time on the terminal + +optional arguments: + -h, --help show this help message and exit + --refresh REFRESH Time between refreshes of the graph and data in + seconds (defaults to 1.0) + --width WIDTH Width of the graph (defaults to 100) + --height HEIGHT Height of the graph (defaults to 30) +``` + +Below is an example run with 200 Jobs flowing through the system. + +[![asciicast](https://asciinema.org/a/xnfzMQ30xWPdhwORiISz0wcEw.svg)](https://asciinema.org/a/xnfzMQ30xWPdhwORiISz0wcEw) diff --git a/docs/licenses/asciichartpy.txt b/docs/licenses/asciichartpy.txt new file mode 100644 index 0000000000..808639aa5b --- /dev/null +++ b/docs/licenses/asciichartpy.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright © 2016 Igor Kroitor + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/requirements/requirements.in b/requirements/requirements.in index 7848e5833b..8b8de10272 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,6 +1,7 @@ aiohttp ansible-runner>=1.4.6 ansiconv==1.0.0 # UPGRADE BLOCKER: from 2013, consider replacing instead of upgrading +asciichartpy azure-keyvault==1.1.0 # see UPGRADE BLOCKERs channels channels-redis>=3.1.0 # https://github.com/django/channels_redis/issues/212 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index c1c4c49d14..dac5c5267d 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -3,6 +3,7 @@ aiohttp==3.6.2 # via -r /awx_devel/requirements/requirements.in aioredis==1.3.1 # via channels-redis ansible-runner==1.4.6 # via -r /awx_devel/requirements/requirements.in ansiconv==1.0.0 # via -r /awx_devel/requirements/requirements.in +asciichartpy==1.5.25 # via -r /awx_devel/requirements/requirements.in asgiref==3.2.5 # via channels, channels-redis, daphne async-timeout==3.0.1 # via aiohttp, aioredis attrs==19.3.0 # via aiohttp, automat, jsonschema, service-identity, twisted @@ -130,4 +131,4 @@ zope.interface==5.0.0 # via twisted # The following packages are considered to be unsafe in a requirements file: pip==19.3.1 # via -r /awx_devel/requirements/requirements.in -setuptools==41.6.0 # via -r /awx_devel/requirements/requirements.in, google-auth, jsonschema, kubernetes, markdown, python-daemon, zope.interface +setuptools==41.6.0 # via -r /awx_devel/requirements/requirements.in, asciichartpy, google-auth, jsonschema, kubernetes, markdown, python-daemon, zope.interface From 79d7c6d9b3c80aba3f9e35a8dc076355223f9c2c Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Wed, 21 Oct 2020 14:17:04 -0400 Subject: [PATCH 8/8] make optimization code work with container groups * Task manager fit_ optimization code caused problems with container group code. * Note that we don't actually get the benefit of the optimization for container groups. We just make it so that the code doesn't blow up. It will take another pass to apply optimizations to the container group task manager path. --- awx/main/scheduler/task_manager.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 861aa0b63f..43d43fe64d 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -14,6 +14,7 @@ from django.db import transaction, connection from django.utils.translation import ugettext_lazy as _, gettext_noop from django.utils.timezone import now as tz_now from django.conf import settings +from django.db.models import Q # AWX from awx.main.dispatch.reaper import reap_job @@ -67,7 +68,7 @@ class TaskManager(): ''' Init AFTER we know this instance of the task manager will run because the lock is acquired. ''' - instances = Instance.objects.filter(capacity__gt=0, enabled=True) + instances = Instance.objects.filter(~Q(hostname=None), capacity__gt=0, enabled=True) self.real_instances = {i.hostname: i for i in instances} instances_partial = [SimpleNamespace(obj=instance, @@ -284,7 +285,7 @@ class TaskManager(): for group in InstanceGroup.objects.all(): if group.is_containerized or group.controller_id: continue - match = group.fit_task_to_most_remaining_capacity_instance(task) + match = group.fit_task_to_most_remaining_capacity_instance(task, group.instances.all()) if match: break task.instance_group = rampart_group @@ -528,7 +529,8 @@ class TaskManager(): logger.debug("Starting {} in group {} instance {} (remaining_capacity={})".format( task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity)) - execution_instance = self.real_instances[execution_instance.hostname] + if execution_instance: + execution_instance = self.real_instances[execution_instance.hostname] self.graph[rampart_group.name]['graph'].add_job(task) self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance) found_acceptable_queue = True