Fix duplicate metrics in AWX subsystem_metrics (#15964)

Separate out operational subsystem metrics to fix the duplicate-metric error

Remove unnecessary comments

Revert to single subsystem_metrics_* metric with labels

Format via black
This commit is contained in:
Chris Coutinho
2025-10-09 10:28:55 +02:00
committed by GitHub
parent 0d18308112
commit 612e8e7688

View File

@@ -44,11 +44,12 @@ class MetricsServer(MetricsServerSettings):
class BaseM: class BaseM:
def __init__(self, field, help_text): def __init__(self, field, help_text, labels=None):
self.field = field self.field = field
self.help_text = help_text self.help_text = help_text
self.current_value = 0 self.current_value = 0
self.metric_has_changed = False self.metric_has_changed = False
self.labels = labels or {}
def reset_value(self, conn): def reset_value(self, conn):
conn.hset(root_key, self.field, 0) conn.hset(root_key, self.field, 0)
@@ -69,12 +70,16 @@ class BaseM:
value = conn.hget(root_key, self.field) value = conn.hget(root_key, self.field)
return self.decode_value(value) return self.decode_value(value)
def to_prometheus(self, instance_data): def to_prometheus(self, instance_data, namespace=None):
output_text = f"# HELP {self.field} {self.help_text}\n# TYPE {self.field} gauge\n" output_text = f"# HELP {self.field} {self.help_text}\n# TYPE {self.field} gauge\n"
for instance in instance_data: for instance in instance_data:
if self.field in instance_data[instance]: if self.field in instance_data[instance]:
# Build label string
labels = f'node="{instance}"'
if namespace:
labels += f',subsystem="{namespace}"'
# on upgrade, if there are stale instances, we can end up with issues where new metrics are not present # on upgrade, if there are stale instances, we can end up with issues where new metrics are not present
output_text += f'{self.field}{{node="{instance}"}} {instance_data[instance][self.field]}\n' output_text += f'{self.field}{{{labels}}} {instance_data[instance][self.field]}\n'
return output_text return output_text
@@ -167,14 +172,17 @@ class HistogramM(BaseM):
self.sum.store_value(conn) self.sum.store_value(conn)
self.inf.store_value(conn) self.inf.store_value(conn)
def to_prometheus(self, instance_data): def to_prometheus(self, instance_data, namespace=None):
output_text = f"# HELP {self.field} {self.help_text}\n# TYPE {self.field} histogram\n" output_text = f"# HELP {self.field} {self.help_text}\n# TYPE {self.field} histogram\n"
for instance in instance_data: for instance in instance_data:
# Build label string
node_label = f'node="{instance}"'
subsystem_label = f',subsystem="{namespace}"' if namespace else ''
for i, b in enumerate(self.buckets): for i, b in enumerate(self.buckets):
output_text += f'{self.field}_bucket{{le="{b}",node="{instance}"}} {sum(instance_data[instance][self.field]["counts"][0:i+1])}\n' output_text += f'{self.field}_bucket{{le="{b}",{node_label}{subsystem_label}}} {sum(instance_data[instance][self.field]["counts"][0:i+1])}\n'
output_text += f'{self.field}_bucket{{le="+Inf",node="{instance}"}} {instance_data[instance][self.field]["inf"]}\n' output_text += f'{self.field}_bucket{{le="+Inf",{node_label}{subsystem_label}}} {instance_data[instance][self.field]["inf"]}\n'
output_text += f'{self.field}_count{{node="{instance}"}} {instance_data[instance][self.field]["inf"]}\n' output_text += f'{self.field}_count{{{node_label}{subsystem_label}}} {instance_data[instance][self.field]["inf"]}\n'
output_text += f'{self.field}_sum{{node="{instance}"}} {instance_data[instance][self.field]["sum"]}\n' output_text += f'{self.field}_sum{{{node_label}{subsystem_label}}} {instance_data[instance][self.field]["sum"]}\n'
return output_text return output_text
@@ -273,20 +281,22 @@ class Metrics(MetricsNamespace):
def pipe_execute(self): def pipe_execute(self):
if self.metrics_have_changed is True: if self.metrics_have_changed is True:
duration_to_save = time.perf_counter() duration_pipe_exec = time.perf_counter()
for m in self.METRICS: for m in self.METRICS:
self.METRICS[m].store_value(self.pipe) self.METRICS[m].store_value(self.pipe)
self.pipe.execute() self.pipe.execute()
self.last_pipe_execute = time.time() self.last_pipe_execute = time.time()
self.metrics_have_changed = False self.metrics_have_changed = False
duration_to_save = time.perf_counter() - duration_to_save duration_pipe_exec = time.perf_counter() - duration_pipe_exec
self.METRICS['subsystem_metrics_pipe_execute_seconds'].inc(duration_to_save)
self.METRICS['subsystem_metrics_pipe_execute_calls'].inc(1)
duration_to_save = time.perf_counter() duration_send_metrics = time.perf_counter()
self.send_metrics() self.send_metrics()
duration_to_save = time.perf_counter() - duration_to_save duration_send_metrics = time.perf_counter() - duration_send_metrics
self.METRICS['subsystem_metrics_send_metrics_seconds'].inc(duration_to_save)
# Increment operational metrics
self.METRICS['subsystem_metrics_pipe_execute_seconds'].inc(duration_pipe_exec)
self.METRICS['subsystem_metrics_pipe_execute_calls'].inc(1)
self.METRICS['subsystem_metrics_send_metrics_seconds'].inc(duration_send_metrics)
def send_metrics(self): def send_metrics(self):
# more than one thread could be calling this at the same time, so should # more than one thread could be calling this at the same time, so should
@@ -352,7 +362,13 @@ class Metrics(MetricsNamespace):
if instance_data: if instance_data:
for field in self.METRICS: for field in self.METRICS:
if len(metrics_filter) == 0 or field in metrics_filter: if len(metrics_filter) == 0 or field in metrics_filter:
output_text += self.METRICS[field].to_prometheus(instance_data) # Add subsystem label only for operational metrics
namespace = (
self._namespace
if field in ['subsystem_metrics_pipe_execute_seconds', 'subsystem_metrics_pipe_execute_calls', 'subsystem_metrics_send_metrics_seconds']
else None
)
output_text += self.METRICS[field].to_prometheus(instance_data, namespace)
return output_text return output_text