mirror of
https://github.com/ansible/awx.git
synced 2026-05-24 17:17:45 -02:30
Add subsystem metrics for the dispatcher (#13989)
This adds a handful of metrics to /api/v2/metrics/ recorded from the dispatcher main process Adds logic in the dispatcher period tasks to calculate these for the last collection interval Reports worker count, task count, scale up events, and availability Add data to demo grafana dashboard
This commit is contained in:
@@ -209,6 +209,11 @@ class Metrics:
|
||||
SetFloatM('workflow_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
|
||||
SetFloatM('workflow_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow tasks'),
|
||||
SetFloatM('workflow_manager_get_tasks_seconds', 'Time spent loading workflow tasks from db'),
|
||||
# dispatcher subsystem metrics
|
||||
SetIntM('dispatcher_pool_scale_up_events', 'Number of times local dispatcher scaled up a worker since startup'),
|
||||
SetIntM('dispatcher_pool_active_task_count', 'Number of active tasks in the worker pool when last task was submitted'),
|
||||
SetIntM('dispatcher_pool_max_worker_count', 'Highest number of workers in worker pool in last collection interval, about 20s'),
|
||||
SetFloatM('dispatcher_availability', 'Fraction of time (in last collection interval) dispatcher was able to receive messages'),
|
||||
]
|
||||
# turn metric list into dictionary with the metric name as a key
|
||||
self.METRICS = {}
|
||||
|
||||
@@ -339,6 +339,17 @@ class AutoscalePool(WorkerPool):
|
||||
# but if the task takes longer than the time defined here, we will force it to stop here
|
||||
self.task_manager_timeout = settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD
|
||||
|
||||
# initialize some things for subsystem metrics periodic gathering
|
||||
# the AutoscalePool class does not save these to redis directly, but reports via produce_subsystem_metrics
|
||||
self.scale_up_ct = 0
|
||||
self.worker_count_max = 0
|
||||
|
||||
def produce_subsystem_metrics(self, metrics_object):
|
||||
metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct)
|
||||
metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers))
|
||||
metrics_object.set('dispatcher_pool_max_worker_count', self.worker_count_max)
|
||||
self.worker_count_max = len(self.workers)
|
||||
|
||||
@property
|
||||
def should_grow(self):
|
||||
if len(self.workers) < self.min_workers:
|
||||
@@ -443,7 +454,12 @@ class AutoscalePool(WorkerPool):
|
||||
idx = random.choice(range(len(self.workers)))
|
||||
return idx, self.workers[idx]
|
||||
else:
|
||||
return super(AutoscalePool, self).up()
|
||||
self.scale_up_ct += 1
|
||||
ret = super(AutoscalePool, self).up()
|
||||
new_worker_ct = len(self.workers)
|
||||
if new_worker_ct > self.worker_count_max:
|
||||
self.worker_count_max = new_worker_ct
|
||||
return ret
|
||||
|
||||
def write(self, preferred_queue, body):
|
||||
if 'guid' in body:
|
||||
|
||||
@@ -19,6 +19,7 @@ from awx.main.dispatch.pool import WorkerPool
|
||||
from awx.main.dispatch import pg_bus_conn
|
||||
from awx.main.utils.common import log_excess_runtime
|
||||
from awx.main.utils.db import set_connection_name
|
||||
import awx.main.analytics.subsystem_metrics as s_metrics
|
||||
|
||||
if 'run_callback_receiver' in sys.argv:
|
||||
logger = logging.getLogger('awx.main.commands.run_callback_receiver')
|
||||
@@ -154,17 +155,30 @@ class AWXConsumerPG(AWXConsumerBase):
|
||||
self.pg_max_wait = settings.DISPATCHER_DB_DOWNTOWN_TOLLERANCE
|
||||
# if no successful loops have ran since startup, then we should fail right away
|
||||
self.pg_is_down = True # set so that we fail if we get database errors on startup
|
||||
self.pg_down_time = time.time() - self.pg_max_wait # allow no grace period
|
||||
self.last_cleanup = time.time()
|
||||
init_time = time.time()
|
||||
self.pg_down_time = init_time - self.pg_max_wait # allow no grace period
|
||||
self.last_cleanup = init_time
|
||||
self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False)
|
||||
self.last_metrics_gather = init_time
|
||||
self.listen_cumulative_time = 0.0
|
||||
|
||||
def run_periodic_tasks(self):
|
||||
self.record_statistics() # maintains time buffer in method
|
||||
|
||||
if time.time() - self.last_cleanup > 60: # same as cluster_node_heartbeat
|
||||
current_time = time.time()
|
||||
if current_time - self.last_cleanup > 60: # same as cluster_node_heartbeat
|
||||
# NOTE: if we run out of database connections, it is important to still run cleanup
|
||||
# so that we scale down workers and free up connections
|
||||
self.pool.cleanup()
|
||||
self.last_cleanup = time.time()
|
||||
self.last_cleanup = current_time
|
||||
|
||||
# record subsystem metrics for the dispatcher
|
||||
if current_time - self.last_metrics_gather > 20:
|
||||
self.pool.produce_subsystem_metrics(self.subsystem_metrics)
|
||||
self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
|
||||
self.subsystem_metrics.pipe_execute()
|
||||
self.listen_cumulative_time = 0.0
|
||||
self.last_metrics_gather = current_time
|
||||
|
||||
def run(self, *args, **kwargs):
|
||||
super(AWXConsumerPG, self).run(*args, **kwargs)
|
||||
@@ -180,11 +194,14 @@ class AWXConsumerPG(AWXConsumerBase):
|
||||
if init is False:
|
||||
self.worker.on_start()
|
||||
init = True
|
||||
self.listen_start = time.time()
|
||||
for e in conn.events(yield_timeouts=True):
|
||||
self.listen_cumulative_time += time.time() - self.listen_start
|
||||
if e is not None:
|
||||
self.process_task(json.loads(e.payload))
|
||||
self.run_periodic_tasks()
|
||||
self.pg_is_down = False
|
||||
self.listen_start = time.time()
|
||||
if self.should_stop:
|
||||
return
|
||||
except psycopg2.InterfaceError:
|
||||
|
||||
@@ -29,17 +29,211 @@
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"collapsed": true,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 37,
|
||||
"panels": [],
|
||||
"title": "System",
|
||||
"type": "row"
|
||||
"id": 38,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "awx_prometheus"
|
||||
},
|
||||
"description": "Fraction of time dispatcher is listening for new messages",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 39,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "awx_prometheus"
|
||||
},
|
||||
"editorMode": "builder",
|
||||
"expr": "dispatcher_availability",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Dispatcher Availability",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "awx_prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 40,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "awx_prometheus"
|
||||
},
|
||||
"editorMode": "builder",
|
||||
"expr": "dispatcher_pool_max_worker_count",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "awx_prometheus"
|
||||
},
|
||||
"editorMode": "builder",
|
||||
"expr": "dispatcher_pool_active_task_count",
|
||||
"hide": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Dispatcher Workers",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
@@ -102,8 +296,115 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 9
|
||||
},
|
||||
"id": 41,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "9.5.2",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "awx_prometheus"
|
||||
},
|
||||
"editorMode": "builder",
|
||||
"expr": "dispatcher_pool_scale_up_events",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Dispatcher Pool Scale-Up Events",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"title": "Dispatcher",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 37,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "awx_prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 26
|
||||
},
|
||||
"id": 14,
|
||||
"options": {
|
||||
"legend": {
|
||||
@@ -169,7 +470,7 @@
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
"y": 26
|
||||
},
|
||||
"id": 25,
|
||||
"links": [],
|
||||
@@ -188,7 +489,7 @@
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.1.6",
|
||||
"pluginVersion": "9.5.2",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -233,7 +534,7 @@
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
"y": 30
|
||||
},
|
||||
"id": 13,
|
||||
"options": {
|
||||
@@ -250,7 +551,7 @@
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.1.6",
|
||||
"pluginVersion": "9.5.2",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -267,6 +568,10 @@
|
||||
],
|
||||
"title": "Controller Node Count",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"title": "System",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
@@ -274,7 +579,7 @@
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 9
|
||||
"y": 2
|
||||
},
|
||||
"id": 35,
|
||||
"panels": [
|
||||
@@ -385,7 +690,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
"y": 26
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
@@ -523,7 +828,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
"y": 26
|
||||
},
|
||||
"id": 29,
|
||||
"options": {
|
||||
@@ -616,7 +921,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 18
|
||||
"y": 34
|
||||
},
|
||||
"id": 16,
|
||||
"options": {
|
||||
@@ -740,7 +1045,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 18
|
||||
"y": 34
|
||||
},
|
||||
"id": 18,
|
||||
"options": {
|
||||
@@ -840,7 +1145,7 @@
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 26
|
||||
"y": 42
|
||||
},
|
||||
"id": 27,
|
||||
"options": {
|
||||
@@ -932,7 +1237,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 26
|
||||
"y": 42
|
||||
},
|
||||
"id": 20,
|
||||
"options": {
|
||||
@@ -973,7 +1278,7 @@
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
"y": 3
|
||||
},
|
||||
"id": 33,
|
||||
"panels": [
|
||||
@@ -1022,7 +1327,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -1064,7 +1370,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 2
|
||||
"y": 27
|
||||
},
|
||||
"id": 12,
|
||||
"options": {
|
||||
@@ -1164,7 +1470,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -1179,7 +1486,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 2
|
||||
"y": 27
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
@@ -1268,7 +1575,7 @@
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 11
|
||||
"y": 4
|
||||
},
|
||||
"id": 31,
|
||||
"panels": [
|
||||
@@ -1336,7 +1643,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 36
|
||||
"y": 28
|
||||
},
|
||||
"id": 26,
|
||||
"options": {
|
||||
@@ -1455,7 +1762,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 36
|
||||
"y": 28
|
||||
},
|
||||
"id": 24,
|
||||
"options": {
|
||||
@@ -1504,7 +1811,7 @@
|
||||
}
|
||||
],
|
||||
"refresh": "5s",
|
||||
"schemaVersion": 37,
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
@@ -1518,6 +1825,6 @@
|
||||
"timezone": "",
|
||||
"title": "awx-demo",
|
||||
"uid": "GISWZOXnk",
|
||||
"version": 12,
|
||||
"version": 13,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user