diff --git a/awx/main/analytics/subsystem_metrics.py b/awx/main/analytics/subsystem_metrics.py index 4e10ff98b8..9b93b98bda 100644 --- a/awx/main/analytics/subsystem_metrics.py +++ b/awx/main/analytics/subsystem_metrics.py @@ -209,6 +209,11 @@ class Metrics: SetFloatM('workflow_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'), SetFloatM('workflow_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow tasks'), SetFloatM('workflow_manager_get_tasks_seconds', 'Time spent loading workflow tasks from db'), + # dispatcher subsystem metrics + SetIntM('dispatcher_pool_scale_up_events', 'Number of times local dispatcher scaled up a worker since startup'), + SetIntM('dispatcher_pool_active_task_count', 'Number of active tasks in the worker pool when last task was submitted'), + SetIntM('dispatcher_pool_max_worker_count', 'Highest number of workers in worker pool in last collection interval, about 20s'), + SetFloatM('dispatcher_availability', 'Fraction of time (in last collection interval) dispatcher was able to receive messages'), ] # turn metric list into dictionary with the metric name as a key self.METRICS = {} diff --git a/awx/main/dispatch/pool.py b/awx/main/dispatch/pool.py index dd2fdffa2a..b8208012b6 100644 --- a/awx/main/dispatch/pool.py +++ b/awx/main/dispatch/pool.py @@ -339,6 +339,17 @@ class AutoscalePool(WorkerPool): # but if the task takes longer than the time defined here, we will force it to stop here self.task_manager_timeout = settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD + # initialize some things for subsystem metrics periodic gathering + # the AutoscalePool class does not save these to redis directly, but reports via produce_subsystem_metrics + self.scale_up_ct = 0 + self.worker_count_max = 0 + + def produce_subsystem_metrics(self, metrics_object): + metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct) + metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers)) + metrics_object.set('dispatcher_pool_max_worker_count', self.worker_count_max) + self.worker_count_max = len(self.workers) + @property def should_grow(self): if len(self.workers) < self.min_workers: @@ -443,7 +454,12 @@ class AutoscalePool(WorkerPool): idx = random.choice(range(len(self.workers))) return idx, self.workers[idx] else: - return super(AutoscalePool, self).up() + self.scale_up_ct += 1 + ret = super(AutoscalePool, self).up() + new_worker_ct = len(self.workers) + if new_worker_ct > self.worker_count_max: + self.worker_count_max = new_worker_ct + return ret def write(self, preferred_queue, body): if 'guid' in body: diff --git a/awx/main/dispatch/worker/base.py b/awx/main/dispatch/worker/base.py index 9a9d4c803c..c10564f6dd 100644 --- a/awx/main/dispatch/worker/base.py +++ b/awx/main/dispatch/worker/base.py @@ -19,6 +19,7 @@ from awx.main.dispatch.pool import WorkerPool from awx.main.dispatch import pg_bus_conn from awx.main.utils.common import log_excess_runtime from awx.main.utils.db import set_connection_name +import awx.main.analytics.subsystem_metrics as s_metrics if 'run_callback_receiver' in sys.argv: logger = logging.getLogger('awx.main.commands.run_callback_receiver') @@ -154,17 +155,30 @@ class AWXConsumerPG(AWXConsumerBase): self.pg_max_wait = settings.DISPATCHER_DB_DOWNTOWN_TOLLERANCE # if no successful loops have ran since startup, then we should fail right away self.pg_is_down = True # set so that we fail if we get database errors on startup - self.pg_down_time = time.time() - self.pg_max_wait # allow no grace period - self.last_cleanup = time.time() + init_time = time.time() + self.pg_down_time = init_time - self.pg_max_wait # allow no grace period + self.last_cleanup = init_time + self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False) + self.last_metrics_gather = init_time + self.listen_cumulative_time = 0.0 def run_periodic_tasks(self): self.record_statistics() # maintains time buffer in method - if time.time() - self.last_cleanup > 60: # same as cluster_node_heartbeat + current_time = time.time() + if current_time - self.last_cleanup > 60: # same as cluster_node_heartbeat # NOTE: if we run out of database connections, it is important to still run cleanup # so that we scale down workers and free up connections self.pool.cleanup() - self.last_cleanup = time.time() + self.last_cleanup = current_time + + # record subsystem metrics for the dispatcher + if current_time - self.last_metrics_gather > 20: + self.pool.produce_subsystem_metrics(self.subsystem_metrics) + self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather)) + self.subsystem_metrics.pipe_execute() + self.listen_cumulative_time = 0.0 + self.last_metrics_gather = current_time def run(self, *args, **kwargs): super(AWXConsumerPG, self).run(*args, **kwargs) @@ -180,11 +194,14 @@ class AWXConsumerPG(AWXConsumerBase): if init is False: self.worker.on_start() init = True + self.listen_start = time.time() for e in conn.events(yield_timeouts=True): + self.listen_cumulative_time += time.time() - self.listen_start if e is not None: self.process_task(json.loads(e.payload)) self.run_periodic_tasks() self.pg_is_down = False + self.listen_start = time.time() if self.should_stop: return except psycopg2.InterfaceError: diff --git a/tools/grafana/dashboards/demo_dashboard.json b/tools/grafana/dashboards/demo_dashboard.json index f654bc8e6f..7c0eacc42b 100644 --- a/tools/grafana/dashboards/demo_dashboard.json +++ b/tools/grafana/dashboards/demo_dashboard.json @@ -29,244 +29,308 @@ "liveNow": false, "panels": [ { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, - "id": 37, - "panels": [], - "title": "System", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "awx_prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 14, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ + "id": 38, + "panels": [ { "datasource": { "type": "prometheus", "uid": "awx_prometheus" }, - "editorMode": "builder", - "expr": "awx_database_connections_total", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Database", - "type": "timeseries" - }, - { - "datasource": {}, - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" + "description": "Fraction of time dispatcher is listening for new messages", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" } }, - "type": "special" + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "dispatcher_availability", + "legendFormat": "__auto", + "range": true, + "refId": "A" } ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "light-blue", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" + "title": "Dispatcher Availability", + "type": "timeseries" }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 5, - "x": 12, - "y": 1 - }, - "id": 25, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "/^tower_version$/", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.6", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "000000021" - }, - "editorMode": "code", - "exemplar": false, - "expr": "awx_system_info", - "format": "table", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Controller Version", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "awx_prometheus" - }, - "fieldConfig": { - "defaults": { - "displayName": "Instances", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "light-blue", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 5, - "x": 12, - "y": 5 - }, - "id": 13, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.6", - "targets": [ { "datasource": { "type": "prometheus", "uid": "awx_prometheus" }, - "editorMode": "code", - "expr": "count(awx_instance_info)", - "interval": "", - "legendFormat": " ", - "range": true, - "refId": "A" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "dispatcher_pool_max_worker_count", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "dispatcher_pool_active_task_count", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Dispatcher Workers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 41, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "dispatcher_pool_scale_up_events", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Dispatcher Pool Scale-Up Events", + "type": "timeseries" } ], - "title": "Controller Node Count", - "type": "stat" + "title": "Dispatcher", + "type": "row" }, { "collapsed": true, @@ -274,7 +338,248 @@ "h": 1, "w": 24, "x": 0, - "y": 9 + "y": 1 + }, + "id": 37, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "awx_database_connections_total", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Database", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-blue", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 12, + "y": 26 + }, + "id": 25, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^tower_version$/", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000021" + }, + "editorMode": "code", + "exemplar": false, + "expr": "awx_system_info", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Controller Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "fieldConfig": { + "defaults": { + "displayName": "Instances", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 12, + "y": 30 + }, + "id": 13, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "code", + "expr": "count(awx_instance_info)", + "interval": "", + "legendFormat": " ", + "range": true, + "refId": "A" + } + ], + "title": "Controller Node Count", + "type": "stat" + } + ], + "title": "System", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 }, "id": 35, "panels": [ @@ -385,7 +690,7 @@ "h": 8, "w": 12, "x": 0, - "y": 10 + "y": 26 }, "id": 8, "options": { @@ -523,7 +828,7 @@ "h": 8, "w": 12, "x": 12, - "y": 10 + "y": 26 }, "id": 29, "options": { @@ -616,7 +921,7 @@ "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 34 }, "id": 16, "options": { @@ -740,7 +1045,7 @@ "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 34 }, "id": 18, "options": { @@ -840,7 +1145,7 @@ "h": 6, "w": 12, "x": 0, - "y": 26 + "y": 42 }, "id": 27, "options": { @@ -932,7 +1237,7 @@ "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 42 }, "id": 20, "options": { @@ -973,7 +1278,7 @@ "h": 1, "w": 24, "x": 0, - "y": 10 + "y": 3 }, "id": 33, "panels": [ @@ -1022,7 +1327,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1064,7 +1370,7 @@ "h": 8, "w": 12, "x": 0, - "y": 2 + "y": 27 }, "id": 12, "options": { @@ -1164,7 +1470,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1179,7 +1486,7 @@ "h": 8, "w": 12, "x": 12, - "y": 2 + "y": 27 }, "id": 10, "options": { @@ -1268,7 +1575,7 @@ "h": 1, "w": 24, "x": 0, - "y": 11 + "y": 4 }, "id": 31, "panels": [ @@ -1336,7 +1643,7 @@ "h": 8, "w": 12, "x": 0, - "y": 36 + "y": 28 }, "id": 26, "options": { @@ -1455,7 +1762,7 @@ "h": 8, "w": 12, "x": 12, - "y": 36 + "y": 28 }, "id": 24, "options": { @@ -1504,7 +1811,7 @@ } ], "refresh": "5s", - "schemaVersion": 37, + "schemaVersion": 38, "style": "dark", "tags": [], "templating": { @@ -1518,6 +1825,6 @@ "timezone": "", "title": "awx-demo", "uid": "GISWZOXnk", - "version": 12, + "version": 13, "weekStart": "" }