From 99815f8962bfe88f8a485b16736599445219fe7d Mon Sep 17 00:00:00 2001 From: Elijah DeLee Date: Fri, 26 Aug 2022 11:40:36 -0400 Subject: [PATCH 1/3] calculate consumed capacity in same way in metrics We should be consistent about this. Also this takes us from doing as many queries to the UnifiedJob table as we have instances to doing 1 query to the UnifiedJob table (and both do 1 query to Instances table) --- awx/main/analytics/collectors.py | 31 ++++++++++++----------- awx/main/scheduler/task_manager_models.py | 8 +++--- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/awx/main/analytics/collectors.py b/awx/main/analytics/collectors.py index 27a41e9dff..e6ee155a83 100644 --- a/awx/main/analytics/collectors.py +++ b/awx/main/analytics/collectors.py @@ -16,6 +16,7 @@ from awx.conf.license import get_license from awx.main.utils import get_awx_version, camelcase_to_underscore, datetime_hook from awx.main import models from awx.main.analytics import register +from awx.main.scheduler.task_manager_models import TaskManagerInstances """ This module is used to define metrics collected by awx.main.analytics.gather() @@ -235,25 +236,25 @@ def projects_by_scm_type(since, **kwargs): @register('instance_info', '1.2', description=_('Cluster topology and capacity')) def instance_info(since, include_hostnames=False, **kwargs): info = {} - instances = models.Instance.objects.values_list('hostname').values( - 'uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'hostname', 'enabled' - ) - for instance in instances: - consumed_capacity = sum(x.task_impact for x in models.UnifiedJob.objects.filter(execution_node=instance['hostname'], status__in=('running', 'waiting'))) + # Use same method that the TaskManager does to compute consumed capacity without querying all running jobs for each Instance + active_tasks = models.UnifiedJob.objects.filter(status__in=['running', 'waiting']).only('task_impact', 'controller_node', 'execution_node') + tm_instances = 
TaskManagerInstances(active_tasks, instance_fields=['uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'enabled']) + for instance in tm_instances.instance_objects: + consumed_capacity = tm_instances[instance.hostname].consumed_capacity instance_info = { - 'uuid': instance['uuid'], - 'version': instance['version'], - 'capacity': instance['capacity'], - 'cpu': instance['cpu'], - 'memory': instance['memory'], - 'managed_by_policy': instance['managed_by_policy'], - 'enabled': instance['enabled'], + 'uuid': instance.uuid, + 'version': instance.version, + 'capacity': instance.capacity, + 'cpu': instance.cpu, + 'memory': instance.memory, + 'managed_by_policy': instance.managed_by_policy, + 'enabled': instance.enabled, 'consumed_capacity': consumed_capacity, - 'remaining_capacity': instance['capacity'] - consumed_capacity, + 'remaining_capacity': instance.capacity - consumed_capacity, } if include_hostnames is True: - instance_info['hostname'] = instance['hostname'] - info[instance['uuid']] = instance_info + instance_info['hostname'] = instance.hostname + info[instance.uuid] = instance_info return info diff --git a/awx/main/scheduler/task_manager_models.py b/awx/main/scheduler/task_manager_models.py index cade939343..b5071266ec 100644 --- a/awx/main/scheduler/task_manager_models.py +++ b/awx/main/scheduler/task_manager_models.py @@ -34,14 +34,14 @@ class TaskManagerInstance: class TaskManagerInstances: - def __init__(self, active_tasks, instances=None): + def __init__(self, active_tasks, instances=None, instance_fields=['node_type', 'capacity', 'hostname', 'enabled']): self.instances_by_hostname = dict() + self.instance_objects = [] if instances is None: - instances = ( - Instance.objects.filter(hostname__isnull=False, enabled=True).exclude(node_type='hop').only('node_type', 'capacity', 'hostname', 'enabled') - ) + instances = Instance.objects.filter(hostname__isnull=False, enabled=True).exclude(node_type='hop').only(*instance_fields) for instance in 
instances: self.instances_by_hostname[instance.hostname] = TaskManagerInstance(instance) + self.instance_objects.append(instance) # initialize remaining capacity based on currently waiting and running tasks for task in active_tasks: From 125801ec5b9f24e9f5ab65feed336a960ae4ce7d Mon Sep 17 00:00:00 2001 From: Elijah DeLee Date: Fri, 26 Aug 2022 15:42:40 -0400 Subject: [PATCH 2/3] add panel to grafana dashboard for capacity also reorganize so there are two columns of panels, not just one long skinny set of panels --- tools/grafana/dashboards/demo_dashboard.json | 117 ++++++++++++++++--- 1 file changed, 103 insertions(+), 14 deletions(-) diff --git a/tools/grafana/dashboards/demo_dashboard.json b/tools/grafana/dashboards/demo_dashboard.json index c23a005d16..b909fdd541 100644 --- a/tools/grafana/dashboards/demo_dashboard.json +++ b/tools/grafana/dashboards/demo_dashboard.json @@ -85,10 +85,96 @@ }, "gridPos": { "h": 8, - "w": 12, + "w": 24, "x": 0, "y": 0 }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "awx_status_total", + "refId": "A" + } + ], + "title": "job status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, "id": 12, "options": { "legend": { @@ -199,7 +285,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 8 }, "id": 10, @@ -458,8 +544,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 24 + "x": 12, + "y": 16 }, "id": 18, "options": { @@ -556,9 +642,9 @@ "h": 8, "w": 12, "x": 0, - "y": 32 + "y": 24 }, - "id": 8, + "id": 14, "options": { "legend": { "calcs": [], @@ -576,11 +662,14 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "awx_status_total", + "editorMode": "builder", + "expr": "awx_database_connections_total", + "legendFormat": "__auto", + "range": true, "refId": "A" } ], - "title": "job status", + "title": "Database", "type": "timeseries" }, { @@ -641,10 +730,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 40 + "x": 12, + "y": 24 }, - "id": 14, + "id": 20, "options": { "legend": { "calcs": [], @@ -663,13 +752,13 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "builder", - "expr": "awx_database_connections_total", + "expr": "awx_instance_consumed_capacity", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Database", + "title": "Consumed Instance Capacity", "type": "timeseries" } ], @@ -688,6 +777,6 @@ "timezone": "", "title": "awx-demo", "uid": "GISWZOXnk", - "version": 2, + "version": 4, "weekStart": "" } From 2437a84b48b42b4c1775081d7a6106e2a3dba2f6 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 29 Aug 2022 14:28:50 -0400 Subject: [PATCH 3/3] Minor changes to instance loop structure --- awx/main/analytics/collectors.py | 8 ++++---- awx/main/scheduler/task_manager_models.py | 4 +--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/awx/main/analytics/collectors.py b/awx/main/analytics/collectors.py index e6ee155a83..27108e66b3 100644 --- 
a/awx/main/analytics/collectors.py +++ b/awx/main/analytics/collectors.py @@ -239,8 +239,8 @@ def instance_info(since, include_hostnames=False, **kwargs): # Use same method that the TaskManager does to compute consumed capacity without querying all running jobs for each Instance active_tasks = models.UnifiedJob.objects.filter(status__in=['running', 'waiting']).only('task_impact', 'controller_node', 'execution_node') tm_instances = TaskManagerInstances(active_tasks, instance_fields=['uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'enabled']) - for instance in tm_instances.instance_objects: - consumed_capacity = tm_instances[instance.hostname].consumed_capacity + for tm_instance in tm_instances.instances_by_hostname.values(): + instance = tm_instance.obj instance_info = { 'uuid': instance.uuid, 'version': instance.version, @@ -249,8 +249,8 @@ def instance_info(since, include_hostnames=False, **kwargs): 'memory': instance.memory, 'managed_by_policy': instance.managed_by_policy, 'enabled': instance.enabled, - 'consumed_capacity': consumed_capacity, - 'remaining_capacity': instance.capacity - consumed_capacity, + 'consumed_capacity': tm_instance.consumed_capacity, + 'remaining_capacity': instance.capacity - tm_instance.consumed_capacity, } if include_hostnames is True: instance_info['hostname'] = instance.hostname diff --git a/awx/main/scheduler/task_manager_models.py b/awx/main/scheduler/task_manager_models.py index b5071266ec..b84cdfcf82 100644 --- a/awx/main/scheduler/task_manager_models.py +++ b/awx/main/scheduler/task_manager_models.py @@ -34,14 +34,12 @@ class TaskManagerInstance: class TaskManagerInstances: - def __init__(self, active_tasks, instances=None, instance_fields=['node_type', 'capacity', 'hostname', 'enabled']): + def __init__(self, active_tasks, instances=None, instance_fields=('node_type', 'capacity', 'hostname', 'enabled')): self.instances_by_hostname = dict() - self.instance_objects = [] if instances is None: instances = 
Instance.objects.filter(hostname__isnull=False, enabled=True).exclude(node_type='hop').only(*instance_fields) for instance in instances: self.instances_by_hostname[instance.hostname] = TaskManagerInstance(instance) - self.instance_objects.append(instance) # initialize remaining capacity based on currently waiting and running tasks for task in active_tasks: