Merge pull request #9671 from fosterseth/fix_4602_pending_jobs_incorrect_metrics

Fix api/v2/metrics data displaying incorrect value

SUMMARY

How to reproduce bug

Disable all instances
Queue up 5 jobs (if using the same job template, enable concurrent jobs)
Enable instance so jobs will start running
Refresh the /api/v2/metrics endpoint. You should see a metric that says awx_status_total{status="pending"} 5.0. Once you see this, don't refresh again.
Wait for all jobs to finish.
Now start refreshing the /api/v2/metrics endpoint. Every once in a while the pending jobs metric will still show awx_status_total{status="pending"} 5.0, even though there are no jobs in the pending state.

Fix
Use a locally defined Prometheus registry instead of the global registry. Each time the endpoint is refreshed, a new, local registry is set up with fresh Prometheus objects (Gauge, Info). This is safe because we never increment these metrics across API calls: on every request we simply look up the current values from the database and set the Prometheus values explicitly, so no metric state needs to persist between requests.

ISSUE TYPE


Bugfix Pull Request

COMPONENT NAME


API

AWX VERSION

awx: 18.0.0

Reviewed-by: Ryan Petrello <None>
This commit is contained in:
softwarefactory-project-zuul[bot] 2021-03-25 19:10:04 +00:00 committed by GitHub
commit 690045c8e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,5 +1,5 @@
from django.conf import settings
from prometheus_client import REGISTRY, PROCESS_COLLECTOR, PLATFORM_COLLECTOR, GC_COLLECTOR, Gauge, Info, generate_latest
from prometheus_client import PROCESS_COLLECTOR, PLATFORM_COLLECTOR, GC_COLLECTOR, CollectorRegistry, Gauge, Info, generate_latest
from awx.conf.license import get_license
from awx.main.utils import get_awx_version, get_ansible_version
@ -11,115 +11,123 @@ from awx.main.analytics.collectors import (
)
REGISTRY.unregister(PROCESS_COLLECTOR)
REGISTRY.unregister(PLATFORM_COLLECTOR)
REGISTRY.unregister(GC_COLLECTOR)
SYSTEM_INFO = Info('awx_system', 'AWX System Information')
ORG_COUNT = Gauge('awx_organizations_total', 'Number of organizations')
USER_COUNT = Gauge('awx_users_total', 'Number of users')
TEAM_COUNT = Gauge('awx_teams_total', 'Number of teams')
INV_COUNT = Gauge('awx_inventories_total', 'Number of inventories')
PROJ_COUNT = Gauge('awx_projects_total', 'Number of projects')
JT_COUNT = Gauge('awx_job_templates_total', 'Number of job templates')
WFJT_COUNT = Gauge('awx_workflow_job_templates_total', 'Number of workflow job templates')
HOST_COUNT = Gauge(
'awx_hosts_total',
'Number of hosts',
[
'type',
],
)
SCHEDULE_COUNT = Gauge('awx_schedules_total', 'Number of schedules')
INV_SCRIPT_COUNT = Gauge('awx_inventory_scripts_total', 'Number of invetory scripts')
USER_SESSIONS = Gauge(
'awx_sessions_total',
'Number of sessions',
[
'type',
],
)
CUSTOM_VENVS = Gauge('awx_custom_virtualenvs_total', 'Number of virtualenvs')
RUNNING_JOBS = Gauge('awx_running_jobs_total', 'Number of running jobs on the Tower system')
PENDING_JOBS = Gauge('awx_pending_jobs_total', 'Number of pending jobs on the Tower system')
STATUS = Gauge(
'awx_status_total',
'Status of Job launched',
[
'status',
],
)
INSTANCE_CAPACITY = Gauge(
'awx_instance_capacity',
'Capacity of each node in a Tower system',
[
'hostname',
'instance_uuid',
],
)
INSTANCE_CPU = Gauge(
'awx_instance_cpu',
'CPU cores on each node in a Tower system',
[
'hostname',
'instance_uuid',
],
)
INSTANCE_MEMORY = Gauge(
'awx_instance_memory',
'RAM (Kb) on each node in a Tower system',
[
'hostname',
'instance_uuid',
],
)
INSTANCE_INFO = Info(
'awx_instance',
'Info about each node in a Tower system',
[
'hostname',
'instance_uuid',
],
)
INSTANCE_LAUNCH_TYPE = Gauge(
'awx_instance_launch_type_total',
'Type of Job launched',
[
'node',
'launch_type',
],
)
INSTANCE_STATUS = Gauge(
'awx_instance_status_total',
'Status of Job launched',
[
'node',
'status',
],
)
INSTANCE_CONSUMED_CAPACITY = Gauge(
'awx_instance_consumed_capacity',
'Consumed capacity of each node in a Tower system',
[
'hostname',
'instance_uuid',
],
)
INSTANCE_REMAINING_CAPACITY = Gauge(
'awx_instance_remaining_capacity',
'Remaining capacity of each node in a Tower system',
[
'hostname',
'instance_uuid',
],
)
LICENSE_INSTANCE_TOTAL = Gauge('awx_license_instance_total', 'Total number of managed hosts provided by your license')
LICENSE_INSTANCE_FREE = Gauge('awx_license_instance_free', 'Number of remaining managed hosts provided by your license')
def metrics():
REGISTRY = CollectorRegistry()
SYSTEM_INFO = Info('awx_system', 'AWX System Information', registry=REGISTRY)
ORG_COUNT = Gauge('awx_organizations_total', 'Number of organizations', registry=REGISTRY)
USER_COUNT = Gauge('awx_users_total', 'Number of users', registry=REGISTRY)
TEAM_COUNT = Gauge('awx_teams_total', 'Number of teams', registry=REGISTRY)
INV_COUNT = Gauge('awx_inventories_total', 'Number of inventories', registry=REGISTRY)
PROJ_COUNT = Gauge('awx_projects_total', 'Number of projects', registry=REGISTRY)
JT_COUNT = Gauge('awx_job_templates_total', 'Number of job templates', registry=REGISTRY)
WFJT_COUNT = Gauge('awx_workflow_job_templates_total', 'Number of workflow job templates', registry=REGISTRY)
HOST_COUNT = Gauge(
'awx_hosts_total',
'Number of hosts',
[
'type',
],
registry=REGISTRY,
)
SCHEDULE_COUNT = Gauge('awx_schedules_total', 'Number of schedules', registry=REGISTRY)
INV_SCRIPT_COUNT = Gauge('awx_inventory_scripts_total', 'Number of invetory scripts', registry=REGISTRY)
USER_SESSIONS = Gauge(
'awx_sessions_total',
'Number of sessions',
[
'type',
],
registry=REGISTRY,
)
CUSTOM_VENVS = Gauge('awx_custom_virtualenvs_total', 'Number of virtualenvs', registry=REGISTRY)
RUNNING_JOBS = Gauge('awx_running_jobs_total', 'Number of running jobs on the Tower system', registry=REGISTRY)
PENDING_JOBS = Gauge('awx_pending_jobs_total', 'Number of pending jobs on the Tower system', registry=REGISTRY)
STATUS = Gauge(
'awx_status_total',
'Status of Job launched',
[
'status',
],
registry=REGISTRY,
)
INSTANCE_CAPACITY = Gauge(
'awx_instance_capacity',
'Capacity of each node in a Tower system',
[
'hostname',
'instance_uuid',
],
registry=REGISTRY,
)
INSTANCE_CPU = Gauge(
'awx_instance_cpu',
'CPU cores on each node in a Tower system',
[
'hostname',
'instance_uuid',
],
registry=REGISTRY,
)
INSTANCE_MEMORY = Gauge(
'awx_instance_memory',
'RAM (Kb) on each node in a Tower system',
[
'hostname',
'instance_uuid',
],
registry=REGISTRY,
)
INSTANCE_INFO = Info(
'awx_instance',
'Info about each node in a Tower system',
[
'hostname',
'instance_uuid',
],
registry=REGISTRY,
)
INSTANCE_LAUNCH_TYPE = Gauge(
'awx_instance_launch_type_total',
'Type of Job launched',
[
'node',
'launch_type',
],
registry=REGISTRY,
)
INSTANCE_STATUS = Gauge(
'awx_instance_status_total',
'Status of Job launched',
[
'node',
'status',
],
registry=REGISTRY,
)
INSTANCE_CONSUMED_CAPACITY = Gauge(
'awx_instance_consumed_capacity',
'Consumed capacity of each node in a Tower system',
[
'hostname',
'instance_uuid',
],
registry=REGISTRY,
)
INSTANCE_REMAINING_CAPACITY = Gauge(
'awx_instance_remaining_capacity',
'Remaining capacity of each node in a Tower system',
[
'hostname',
'instance_uuid',
],
registry=REGISTRY,
)
LICENSE_INSTANCE_TOTAL = Gauge('awx_license_instance_total', 'Total number of managed hosts provided by your license', registry=REGISTRY)
LICENSE_INSTANCE_FREE = Gauge('awx_license_instance_free', 'Number of remaining managed hosts provided by your license', registry=REGISTRY)
license_info = get_license()
SYSTEM_INFO.info(
{
@ -197,7 +205,7 @@ def metrics():
for status, value in statuses.items():
INSTANCE_STATUS.labels(node=node, status=status).set(value)
return generate_latest()
return generate_latest(registry=REGISTRY)
__all__ = ['metrics']