initial prometheus commit

Co-authored-by: Wayne Witzel III <wayne@riotousliving.com>
Co-authored-by: Christian Adams <rooftopcellist@gmail.com>
Wayne Witzel III 2019-04-05 15:46:54 -04:00 committed by Christian Adams
parent e9f2fddc7f
commit c3812de3d6
9 changed files with 247 additions and 6 deletions

awx/api/metrics.py

@@ -0,0 +1,15 @@
# Copyright (c) 2017 Ansible, Inc.
# All Rights Reserved.
from django.conf.urls import url
from awx.api.views import (
    MetricsView
)

urls = [
    url(r'^$', MetricsView.as_view(), name='metrics_view'),
]

__all__ = ['urls']


@@ -34,6 +34,8 @@ from awx.api.views import (
OAuth2ApplicationDetail,
)
from awx.api.views.metrics import MetricsView
from .organization import urls as organization_urls
from .user import urls as user_urls
from .project import urls as project_urls
@@ -133,6 +135,7 @@ v2_urls = [
url(r'^applications/(?P<pk>[0-9]+)/tokens/$', ApplicationOAuth2TokenList.as_view(), name='application_o_auth2_token_list'),
url(r'^tokens/$', OAuth2TokenList.as_view(), name='o_auth2_token_list'),
url(r'^', include(oauth2_urls)),
url(r'^metrics/$', MetricsView.as_view(), name='metrics_view'),
]
app_name = 'api'

awx/api/views/metrics.py

@@ -0,0 +1,46 @@
# Copyright (c) 2018 Red Hat, Inc.
# All Rights Reserved.
# Python
import logging
# Django
from django.conf import settings
from django.utils.translation import ugettext_lazy as _
from django.utils.timezone import now
# Django REST Framework
from rest_framework.exceptions import PermissionDenied
from rest_framework.response import Response
from rest_framework.renderers import JSONRenderer, StaticHTMLRenderer
# AWX
# from awx.main.analytics import collectors
from awx.main.analytics.metrics import metrics
from awx.api import renderers
from awx.api.generics import (
    APIView,
)
from awx.api.serializers import (
    InventorySerializer,
    ActivityStreamSerializer,
)

logger = logging.getLogger('awx.main.analytics')


class MetricsView(APIView):

    view_name = _('Metrics')
    swagger_topic = 'Metrics'

    renderer_classes = [renderers.PlainTextRenderer,
                        renderers.BrowsableAPIRenderer,
                        JSONRenderer,]

    def get(self, request, format='txt'):
        ''' Show Metrics Details '''
        return Response(metrics().decode('UTF-8'))
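For manual testing, the new endpoint can be queried directly; a minimal sketch, assuming a local development install reachable at https://localhost:8043 with placeholder admin credentials:

import requests

# Fetch the Prometheus exposition text from the metrics endpoint.
# The host, credentials, and verify=False are assumptions for a local dev install.
resp = requests.get(
    'https://localhost:8043/api/v2/metrics/',
    params={'format': 'txt'},
    auth=('admin', 'password'),
    verify=False,
)
print(resp.text)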


@@ -104,6 +104,7 @@ class ApiVersionRootView(APIView):
data['credential_input_sources'] = reverse('api:credential_input_source_list', request=request)
data['applications'] = reverse('api:o_auth2_application_list', request=request)
data['tokens'] = reverse('api:o_auth2_token_list', request=request)
data['metrics'] = reverse('api:metrics_view', request=request)
data['inventory'] = reverse('api:inventory_list', request=request)
data['inventory_scripts'] = reverse('api:inventory_script_list', request=request)
data['inventory_sources'] = reverse('api:inventory_source_list', request=request)
@@ -278,6 +279,3 @@ class ApiV1ConfigView(APIView):
except Exception:
# FIX: Log
return Response({"error": _("Failed to remove license.")}, status=status.HTTP_400_BAD_REQUEST)


@@ -158,7 +158,7 @@ def instance_info(since):
instances = models.Instance.objects.values_list('hostname').annotate().values(
'uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'hostname', 'last_isolated_check', 'enabled')
for instance in instances:
info = {'uuid': instance['uuid'],
instance_info = {'uuid': instance['uuid'],
'version': instance['version'],
'capacity': instance['capacity'],
'cpu': instance['cpu'],
@@ -167,6 +167,7 @@
'last_isolated_check': instance['last_isolated_check'],
'enabled': instance['enabled']
}
info[instance['uuid']] = instance_info
return info
@@ -186,12 +187,12 @@ def job_instance_counts(since):
job_types = models.UnifiedJob.objects.exclude(launch_type='sync').values_list(
'execution_node', 'launch_type').annotate(job_launch_type=Count('launch_type'))
for job in job_types:
counts.setdefault(job[0], {}).setdefault('status', {})[job[1]] = job[2]
counts.setdefault(job[0], {}).setdefault('launch_type', {})[job[1]] = job[2]
job_statuses = models.UnifiedJob.objects.exclude(launch_type='sync').values_list(
'execution_node', 'status').annotate(job_status=Count('status'))
for job in job_statuses:
counts.setdefault(job[0], {}).setdefault('launch_type', {})[job[1]] = job[2]
counts.setdefault(job[0], {}).setdefault('status', {})[job[1]] = job[2]
return counts
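For reference, the shapes these collectors now return look roughly like the following; node names, uuids, and numbers are made up for illustration:

# Illustrative shape of instance_info(): one entry per instance, keyed by uuid.
example_instance_info = {
    '00000000-0000-0000-0000-000000000000': {
        'uuid': '00000000-0000-0000-0000-000000000000',
        'version': '4.0.0',
        'capacity': 57,
        'cpu': 4,
        'memory': 8196869120,
        'managed_by_policy': True,
        'last_isolated_check': None,
        'enabled': True,
    },
}

# Illustrative shape of job_instance_counts(): one entry per execution node,
# with separate 'launch_type' and 'status' breakdowns.
example_job_instance_counts = {
    'awx-node-1': {
        'launch_type': {'manual': 12, 'scheduled': 4},
        'status': {'successful': 14, 'failed': 2},
    },
}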


@@ -0,0 +1,127 @@
import os
from datetime import datetime
from prometheus_client import (
    REGISTRY,
    PROCESS_COLLECTOR,
    PLATFORM_COLLECTOR,
    GC_COLLECTOR,
    Gauge,
    Info,
    generate_latest
)
from django.contrib.sessions.models import Session
# Temporary Imports
from django.db import connection
from django.db.models import Count
from django.conf import settings
from awx.conf.license import get_license
from awx.main.utils import (get_awx_version, get_ansible_version,
                            get_custom_venv_choices)
from awx.main import models
from awx.main.analytics.collectors import (
    counts,
    instance_info,
    job_instance_counts
)
from awx.main.analytics import register
REGISTRY.unregister(PROCESS_COLLECTOR)
REGISTRY.unregister(PLATFORM_COLLECTOR)
REGISTRY.unregister(GC_COLLECTOR)
SYSTEM_INFO = Info('awx_system', 'AWX System Information')
ORG_COUNT = Gauge('awx_organizations_total', 'Number of organizations')
USER_COUNT = Gauge('awx_users_total', 'Number of users')
TEAM_COUNT = Gauge('awx_teams_total', 'Number of teams')
INV_COUNT = Gauge('awx_inventories_total', 'Number of inventories')
PROJ_COUNT = Gauge('awx_projects_total', 'Number of projects')
JT_COUNT = Gauge('awx_job_templates_total', 'Number of job templates')
WFJT_COUNT = Gauge('awx_workflow_job_templates_total', 'Number of workflow job templates')
HOST_COUNT = Gauge('awx_hosts_total', 'Number of hosts', ['type',])
SCHEDULE_COUNT = Gauge('awx_schedules_total', 'Number of schedules')
INV_SCRIPT_COUNT = Gauge('awx_inventory_scripts_total', 'Number of inventory scripts')
USER_SESSIONS = Gauge('awx_sessions_total', 'Number of sessions', ['type',])
CUSTOM_VENVS = Gauge('awx_custom_virtualenvs_total', 'Number of virtualenvs')
RUNNING_JOBS = Gauge('awx_running_jobs_total', 'Number of running jobs on the Tower system')
INSTANCE_CAPACITY = Gauge('awx_instance_capacity', 'Capacity of each node in a Tower system', ['type',])
INSTANCE_CPU = Gauge('awx_instance_cpu', 'CPU cores on each node in a Tower system', ['type',])
INSTANCE_MEMORY = Gauge('awx_instance_memory', 'RAM (Kb) on each node in a Tower system', ['type',])
INSTANCE_INFO = Info('awx_instance', 'Info about each node in a Tower system', ['type',])
INSTANCE_LAUNCH_TYPE = Gauge('awx_instance_launch_type_total', 'Type of Job launched', ['node', 'launch_type',])
INSTANCE_STATUS = Gauge('awx_instance_status_total', 'Status of Job launched', ['node', 'status',])
def metrics():
    license_info = get_license(show_key=False)
    SYSTEM_INFO.info({'system_uuid': settings.SYSTEM_UUID,
                      'tower_url_base': settings.TOWER_URL_BASE,
                      'tower_version': get_awx_version(),
                      'ansible_version': get_ansible_version(),
                      'license_type': license_info.get('license_type', 'UNLICENSED'),
                      'free_instances': str(license_info.get('free_instances', 0)),
                      'license_expiry': str(license_info.get('time_remaining', 0)),
                      'pendo_tracking': settings.PENDO_TRACKING_STATE,
                      'external_logger_enabled': str(settings.LOG_AGGREGATOR_ENABLED),
                      'external_logger_type': getattr(settings, 'LOG_AGGREGATOR_TYPE', 'None')})

    current_counts = counts(datetime.now())

    ORG_COUNT.set(current_counts['organization'])
    USER_COUNT.set(current_counts['user'])
    TEAM_COUNT.set(current_counts['team'])
    INV_COUNT.set(current_counts['inventory'])
    PROJ_COUNT.set(current_counts['project'])
    JT_COUNT.set(current_counts['job_template'])
    WFJT_COUNT.set(current_counts['workflow_job_template'])

    HOST_COUNT.labels(type='all').set(current_counts['host'])
    HOST_COUNT.labels(type='active').set(current_counts['active_host_count'])

    SCHEDULE_COUNT.set(current_counts['schedule'])
    INV_SCRIPT_COUNT.set(current_counts['custom_inventory_script'])
    CUSTOM_VENVS.set(current_counts['custom_virtualenvs'])

    USER_SESSIONS.labels(type='all').set(current_counts['active_sessions'])
    USER_SESSIONS.labels(type='user').set(current_counts['active_user_sessions'])
    USER_SESSIONS.labels(type='anonymous').set(current_counts['active_anonymous_sessions'])

    RUNNING_JOBS.set(current_counts['running_jobs'])

    instance_data = instance_info(datetime.now())
    for uuid in instance_data:
        INSTANCE_CAPACITY.labels(type=uuid).set(instance_data[uuid]['capacity'])
        INSTANCE_CPU.labels(type=uuid).set(instance_data[uuid]['cpu'])
        INSTANCE_MEMORY.labels(type=uuid).set(instance_data[uuid]['memory'])
        INSTANCE_INFO.labels(type=uuid).info({'enabled': str(instance_data[uuid]['enabled']),
                                              'last_isolated_check': str(instance_data[uuid].get('last_isolated_check', 'None')),
                                              'managed_by_policy': str(instance_data[uuid]['managed_by_policy']),
                                              'version': instance_data[uuid]['version']})

    instance_data = job_instance_counts(datetime.now())
    for node in instance_data:
        # skipping internal execution node (for system jobs)
        # TODO: determine if we should exclude execution_node from instance count
        if node == '':
            continue
        types = instance_data[node].get('launch_type', {})
        for launch_type, value in types.items():
            INSTANCE_LAUNCH_TYPE.labels(node=node, launch_type=launch_type).set(value)
        statuses = instance_data[node].get('status', {})
        for status, value in statuses.items():
            INSTANCE_STATUS.labels(node=node, status=status).set(value)

    return generate_latest()
__all__ = ['metrics']
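The module leans entirely on prometheus_client; a minimal standalone sketch of the same pattern, with a made-up metric name and value:

from prometheus_client import (REGISTRY, PROCESS_COLLECTOR, PLATFORM_COLLECTOR,
                               GC_COLLECTOR, Gauge, generate_latest)

# Drop the default process/platform/GC collectors so only explicitly
# defined metrics appear in the output, as the module above does.
REGISTRY.unregister(PROCESS_COLLECTOR)
REGISTRY.unregister(PLATFORM_COLLECTOR)
REGISTRY.unregister(GC_COLLECTOR)

# A labeled gauge produces one time series per label value.
DEMO = Gauge('awx_demo_total', 'Illustrative gauge', ['type'])
DEMO.labels(type='all').set(42)

# generate_latest() renders every registered metric in the Prometheus text
# exposition format as bytes; MetricsView decodes this to UTF-8 for the response.
print(generate_latest().decode('UTF-8'))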

docs/prometheus.md

@@ -0,0 +1,49 @@
# Prometheus Support
## Development
Start a Prometheus container (this expects prometheus.yml to be in the current directory):

docker run --net=tools_default --link=tools_awx_1:awxweb --volume $(pwd)/prometheus.yml:/prometheus.yml --name prometheus -d -p 127.0.0.1:9090:9090 prom/prometheus --web.enable-lifecycle --config.file=/prometheus.yml
Example Prometheus config:

# my global config
global:
  scrape_interval: 15s      # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'awx'
    tls_config:
      insecure_skip_verify: True
    metrics_path: /api/v2/metrics
    scrape_interval: 5s
    scheme: https
    params:
      format: ['txt']
    basic_auth:
      username: root
      password: reverse
    # bearer_token: <token_value>
    static_configs:
      - targets:
          - awxweb:8043
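Once Prometheus is scraping, the awx_* series defined above should appear both at the endpoint and in the Prometheus UI; an illustrative fragment of the exposition text (values made up):

# HELP awx_organizations_total Number of organizations
# TYPE awx_organizations_total gauge
awx_organizations_total 2.0
# HELP awx_sessions_total Number of sessions
# TYPE awx_sessions_total gauge
awx_sessions_total{type="all"} 3.0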


@@ -29,6 +29,7 @@ jsonschema==2.6.0
Markdown==2.6.11 # used for formatting API help
ordereddict==1.1
pexpect==4.6.0
prometheus_client==0.6.0
psutil==5.4.3
psycopg2==2.7.3.2 # problems with Segmentation faults / wheels on upgrade
pygerduty==0.37.0


@@ -74,6 +74,7 @@ oauthlib==2.0.6 # via django-oauth-toolkit, requests-oauthlib, social-
ordereddict==1.1
pexpect==4.6.0
pkgconfig==1.4.0 # via xmlsec
prometheus_client==0.6.0
psutil==5.4.3
psycopg2==2.7.3.2
ptyprocess==0.6.0 # via pexpect