Add subsystem metrics for the dispatcher (#13989)

This adds a handful of metrics to /api/v2/metrics/ recorded from the dispatcher main process

Adds logic in the dispatcher period tasks to calculate these for the last collection interval
Reports worker count, task count, scale up events, and availability

Add data to demo grafana dashboard
This commit is contained in:
Alan Rominger
2023-05-17 14:29:31 -04:00
committed by GitHub
parent 84f67c7f82
commit ef99770383
4 changed files with 582 additions and 237 deletions

View File

@@ -339,6 +339,17 @@ class AutoscalePool(WorkerPool):
# but if the task takes longer than the time defined here, we will force it to stop here
self.task_manager_timeout = settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD
# initialize some things for subsystem metrics periodic gathering
# the AutoscalePool class does not save these to redis directly, but reports via produce_subsystem_metrics
self.scale_up_ct = 0
self.worker_count_max = 0
def produce_subsystem_metrics(self, metrics_object):
metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct)
metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers))
metrics_object.set('dispatcher_pool_max_worker_count', self.worker_count_max)
self.worker_count_max = len(self.workers)
@property
def should_grow(self):
if len(self.workers) < self.min_workers:
@@ -443,7 +454,12 @@ class AutoscalePool(WorkerPool):
idx = random.choice(range(len(self.workers)))
return idx, self.workers[idx]
else:
return super(AutoscalePool, self).up()
self.scale_up_ct += 1
ret = super(AutoscalePool, self).up()
new_worker_ct = len(self.workers)
if new_worker_ct > self.worker_count_max:
self.worker_count_max = new_worker_ct
return ret
def write(self, preferred_queue, body):
if 'guid' in body: