mirror of
https://github.com/ansible/awx.git
synced 2026-05-19 23:07:42 -02:30
Add subsystem metrics for task manager
This commit is contained in:
@@ -8,7 +8,7 @@ from django.apps import apps
|
||||
from awx.main.consumers import emit_channel_notification
|
||||
|
||||
root_key = 'awx_metrics'
|
||||
logger = logging.getLogger('awx.main.wsbroadcast')
|
||||
logger = logging.getLogger('awx.main.analytics')
|
||||
|
||||
|
||||
class BaseM:
|
||||
@@ -16,16 +16,22 @@ class BaseM:
|
||||
self.field = field
|
||||
self.help_text = help_text
|
||||
self.current_value = 0
|
||||
self.metric_has_changed = False
|
||||
|
||||
def clear_value(self, conn):
|
||||
def reset_value(self, conn):
|
||||
conn.hset(root_key, self.field, 0)
|
||||
self.current_value = 0
|
||||
|
||||
def inc(self, value):
|
||||
self.current_value += value
|
||||
self.metric_has_changed = True
|
||||
|
||||
def set(self, value):
|
||||
self.current_value = value
|
||||
self.metric_has_changed = True
|
||||
|
||||
def get(self):
|
||||
return self.current_value
|
||||
|
||||
def decode(self, conn):
|
||||
value = conn.hget(root_key, self.field)
|
||||
@@ -34,7 +40,7 @@ class BaseM:
|
||||
def to_prometheus(self, instance_data):
|
||||
output_text = f"# HELP {self.field} {self.help_text}\n# TYPE {self.field} gauge\n"
|
||||
for instance in instance_data:
|
||||
output_text += f'{self.field}{{node="{instance}"}} {instance_data[instance].get(self.field, -1)}\n' # TODO: fix because this -1 is neccessary when dealing with old instances (ex. you didn't clean up your database)
|
||||
output_text += f'{self.field}{{node="{instance}"}} {instance_data[instance][self.field]}\n'
|
||||
return output_text
|
||||
|
||||
|
||||
@@ -46,8 +52,10 @@ class FloatM(BaseM):
|
||||
return 0.0
|
||||
|
||||
def store_value(self, conn):
|
||||
conn.hincrbyfloat(root_key, self.field, self.current_value)
|
||||
self.current_value = 0
|
||||
if self.metric_has_changed:
|
||||
conn.hincrbyfloat(root_key, self.field, self.current_value)
|
||||
self.current_value = 0
|
||||
self.metric_has_changed = False
|
||||
|
||||
|
||||
class IntM(BaseM):
|
||||
@@ -58,8 +66,10 @@ class IntM(BaseM):
|
||||
return 0
|
||||
|
||||
def store_value(self, conn):
|
||||
conn.hincrby(root_key, self.field, self.current_value)
|
||||
self.current_value = 0
|
||||
if self.metric_has_changed:
|
||||
conn.hincrby(root_key, self.field, self.current_value)
|
||||
self.current_value = 0
|
||||
self.metric_has_changed = False
|
||||
|
||||
|
||||
class SetIntM(BaseM):
|
||||
@@ -70,10 +80,9 @@ class SetIntM(BaseM):
|
||||
return 0
|
||||
|
||||
def store_value(self, conn):
|
||||
# do not set value if it has not changed since last time this was called
|
||||
if self.current_value is not None:
|
||||
if self.metric_has_changed:
|
||||
conn.hset(root_key, self.field, self.current_value)
|
||||
self.current_value = None
|
||||
self.metric_has_changed = False
|
||||
|
||||
|
||||
class SetFloatM(SetIntM):
|
||||
@@ -94,13 +103,13 @@ class HistogramM(BaseM):
|
||||
self.sum = IntM(field + '_sum', '')
|
||||
super(HistogramM, self).__init__(field, help_text)
|
||||
|
||||
def clear_value(self, conn):
|
||||
def reset_value(self, conn):
|
||||
conn.hset(root_key, self.field, 0)
|
||||
self.inf.clear_value(conn)
|
||||
self.sum.clear_value(conn)
|
||||
self.inf.reset_value(conn)
|
||||
self.sum.reset_value(conn)
|
||||
for b in self.buckets_to_keys.values():
|
||||
b.clear_value(conn)
|
||||
super(HistogramM, self).clear_value(conn)
|
||||
b.reset_value(conn)
|
||||
super(HistogramM, self).reset_value(conn)
|
||||
|
||||
def observe(self, value):
|
||||
for b in self.buckets:
|
||||
@@ -136,7 +145,7 @@ class HistogramM(BaseM):
|
||||
|
||||
|
||||
class Metrics:
|
||||
def __init__(self, auto_pipe_execute=True, instance_name=None):
|
||||
def __init__(self, auto_pipe_execute=False, instance_name=None):
|
||||
self.pipe = redis.Redis.from_url(settings.BROKER_URL).pipeline()
|
||||
self.conn = redis.Redis.from_url(settings.BROKER_URL)
|
||||
self.last_pipe_execute = time.time()
|
||||
@@ -152,6 +161,8 @@ class Metrics:
|
||||
Instance = apps.get_model('main', 'Instance')
|
||||
if instance_name:
|
||||
self.instance_name = instance_name
|
||||
elif settings.IS_TESTING():
|
||||
self.instance_name = "awx_testing"
|
||||
else:
|
||||
self.instance_name = Instance.objects.me().hostname
|
||||
|
||||
@@ -167,10 +178,23 @@ class Metrics:
|
||||
HistogramM(
|
||||
'callback_receiver_batch_events_insert_db', 'Number of events batch inserted into database', settings.SUBSYSTEM_METRICS_BATCH_INSERT_BUCKETS
|
||||
),
|
||||
SetFloatM('callback_receiver_event_processing_avg_seconds', 'Average processing time per event per callback receiver batch'),
|
||||
FloatM('subsystem_metrics_pipe_execute_seconds', 'Time spent saving metrics to redis'),
|
||||
IntM('subsystem_metrics_pipe_execute_calls', 'Number of calls to pipe_execute'),
|
||||
FloatM('subsystem_metrics_send_metrics_seconds', 'Time spent sending metrics to other nodes'),
|
||||
SetFloatM('callback_receiver_event_processing_avg_seconds', 'Average processing time per event per callback receiver batch'),
|
||||
SetFloatM('task_manager_get_tasks_seconds', 'Time spent in loading all tasks from db'),
|
||||
SetFloatM('task_manager_start_task_seconds', 'Time spent starting task'),
|
||||
SetFloatM('task_manager_process_running_tasks_seconds', 'Time spent processing running tasks'),
|
||||
SetFloatM('task_manager_process_pending_tasks_seconds', 'Time spent processing pending tasks'),
|
||||
SetFloatM('task_manager_generate_dependencies_seconds', 'Time spent generating dependencies for pending tasks'),
|
||||
SetFloatM('task_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow jobs'),
|
||||
SetFloatM('task_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
|
||||
IntM('task_manager_schedule_calls', 'Number of calls to task manager schedule'),
|
||||
SetFloatM('task_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
|
||||
SetIntM('task_manager_tasks_started', 'Number of tasks started'),
|
||||
SetIntM('task_manager_running_processed', 'Number of running tasks processed'),
|
||||
SetIntM('task_manager_pending_processed', 'Number of pending tasks processed'),
|
||||
SetIntM('task_manager_tasks_blocked', 'Number of tasks blocked from running'),
|
||||
]
|
||||
# turn metric list into dictionary with the metric name as a key
|
||||
self.METRICS = {}
|
||||
@@ -180,11 +204,11 @@ class Metrics:
|
||||
# track last time metrics were sent to other nodes
|
||||
self.previous_send_metrics = SetFloatM('send_metrics_time', 'Timestamp of previous send_metrics call')
|
||||
|
||||
def clear_values(self, fields=None):
|
||||
if not fields:
|
||||
fields = self.METRICS.keys()
|
||||
for m in fields:
|
||||
self.METRICS[m].clear_value(self.conn)
|
||||
def reset_values(self):
|
||||
# intended to be called once on app startup to reset all metric
|
||||
# values to 0
|
||||
for m in self.METRICS.values():
|
||||
m.reset_value(self.conn)
|
||||
self.metrics_have_changed = True
|
||||
self.conn.delete(root_key + "_lock")
|
||||
|
||||
@@ -192,19 +216,25 @@ class Metrics:
|
||||
if value != 0:
|
||||
self.METRICS[field].inc(value)
|
||||
self.metrics_have_changed = True
|
||||
if self.auto_pipe_execute is True and self.should_pipe_execute() is True:
|
||||
if self.auto_pipe_execute is True:
|
||||
self.pipe_execute()
|
||||
|
||||
def set(self, field, value):
|
||||
self.METRICS[field].set(value)
|
||||
self.metrics_have_changed = True
|
||||
if self.auto_pipe_execute is True and self.should_pipe_execute() is True:
|
||||
if self.auto_pipe_execute is True:
|
||||
self.pipe_execute()
|
||||
|
||||
def get(self, field):
|
||||
return self.METRICS[field].get()
|
||||
|
||||
def decode(self, field):
|
||||
return self.METRICS[field].decode(self.conn)
|
||||
|
||||
def observe(self, field, value):
|
||||
self.METRICS[field].observe(value)
|
||||
self.metrics_have_changed = True
|
||||
if self.auto_pipe_execute is True and self.should_pipe_execute() is True:
|
||||
if self.auto_pipe_execute is True:
|
||||
self.pipe_execute()
|
||||
|
||||
def serialize_local_metrics(self):
|
||||
@@ -252,8 +282,8 @@ class Metrics:
|
||||
|
||||
def send_metrics(self):
|
||||
# more than one thread could be calling this at the same time, so should
|
||||
# get acquire redis lock before sending metrics
|
||||
lock = self.conn.lock(root_key + '_lock', thread_local=False)
|
||||
# acquire redis lock before sending metrics
|
||||
lock = self.conn.lock(root_key + '_lock')
|
||||
if not lock.acquire(blocking=False):
|
||||
return
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user