# Copyright (c) 2017 Ansible, Inc.
# All Rights Reserved.

from django.conf.urls import url

# Import the view from the module that defines it (awx/api/views/metrics.py);
# awx/api/urls/urls.py uses this same path.
from awx.api.views.metrics import MetricsView


# URL patterns for the metrics endpoint: the empty pattern maps the root of
# whatever prefix this list is included under to MetricsView.
urls = [
    url(r'^$', MetricsView.as_view(), name='metrics_view'),
]

__all__ = ['urls']
# Python
import logging

# Django
from django.conf import settings
from django.utils.translation import ugettext_lazy as _
from django.utils.timezone import now

# Django REST Framework
from rest_framework.exceptions import PermissionDenied
from rest_framework.response import Response
from rest_framework.renderers import JSONRenderer, StaticHTMLRenderer

# AWX
# from awx.main.analytics import collectors
from awx.main.analytics.metrics import metrics
from awx.api import renderers

from awx.api.generics import (
    APIView,
)

from awx.api.serializers import (
    InventorySerializer,
    ActivityStreamSerializer,
)

logger = logging.getLogger('awx.main.analytics')


class MetricsView(APIView):
    # API view whose GET handler returns the output of
    # awx.main.analytics.metrics.metrics() as text.

    view_name = _('Metrics')
    swagger_topic = 'Metrics'

    # Renderers offered by this view, in the order they are declared.
    renderer_classes = [
        renderers.PlainTextRenderer,
        renderers.BrowsableAPIRenderer,
        JSONRenderer,
    ]

    def get(self, request, format='txt'):
        '''Return the current metrics, decoded from UTF-8, in a Response.'''
        payload = metrics()
        text = payload.decode('UTF-8')
        return Response(text)
b/awx/main/analytics/collectors.py index ed6aeb819e..4c9f1d9c83 100644 --- a/awx/main/analytics/collectors.py +++ b/awx/main/analytics/collectors.py @@ -158,7 +158,7 @@ def instance_info(since): instances = models.Instance.objects.values_list('hostname').annotate().values( 'uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'hostname', 'last_isolated_check', 'enabled') for instance in instances: - info = {'uuid': instance['uuid'], + instance_info = {'uuid': instance['uuid'], 'version': instance['version'], 'capacity': instance['capacity'], 'cpu': instance['cpu'], @@ -167,6 +167,7 @@ def instance_info(since): 'last_isolated_check': instance['last_isolated_check'], 'enabled': instance['enabled'] } + info[instance['uuid']] = instance_info return info @@ -186,12 +187,12 @@ def job_instance_counts(since): job_types = models.UnifiedJob.objects.exclude(launch_type='sync').values_list( 'execution_node', 'launch_type').annotate(job_launch_type=Count('launch_type')) for job in job_types: - counts.setdefault(job[0], {}).setdefault('status', {})[job[1]] = job[2] + counts.setdefault(job[0], {}).setdefault('launch_type', {})[job[1]] = job[2] job_statuses = models.UnifiedJob.objects.exclude(launch_type='sync').values_list( 'execution_node', 'status').annotate(job_status=Count('status')) for job in job_statuses: - counts.setdefault(job[0], {}).setdefault('launch_type', {})[job[1]] = job[2] + counts.setdefault(job[0], {}).setdefault('status', {})[job[1]] = job[2] return counts diff --git a/awx/main/analytics/metrics.py b/awx/main/analytics/metrics.py new file mode 100644 index 0000000000..1183ab47b8 --- /dev/null +++ b/awx/main/analytics/metrics.py @@ -0,0 +1,127 @@ +import os +from datetime import datetime + +from prometheus_client import ( + REGISTRY, + PROCESS_COLLECTOR, + PLATFORM_COLLECTOR, + GC_COLLECTOR, + Gauge, + Info, + generate_latest +) + +from django.contrib.sessions.models import Session + +# Temporary Imports +from django.db import connection +from 
from django.db.models import Count
from django.conf import settings

from awx.conf.license import get_license
from awx.main.utils import (get_awx_version, get_ansible_version,
                            get_custom_venv_choices)
from awx.main import models
from awx.main.analytics.collectors import (
    counts,
    instance_info,
    job_instance_counts
)
from django.contrib.sessions.models import Session
from awx.main.analytics import register


# Remove prometheus_client's default process, platform and GC collectors
# from the registry.
REGISTRY.unregister(PROCESS_COLLECTOR)
REGISTRY.unregister(PLATFORM_COLLECTOR)
REGISTRY.unregister(GC_COLLECTOR)


# System-wide information and object counts.
SYSTEM_INFO = Info('awx_system', 'AWX System Information')
ORG_COUNT = Gauge('awx_organizations_total', 'Number of organizations')
USER_COUNT = Gauge('awx_users_total', 'Number of users')
TEAM_COUNT = Gauge('awx_teams_total', 'Number of teams')
INV_COUNT = Gauge('awx_inventories_total', 'Number of inventories')
PROJ_COUNT = Gauge('awx_projects_total', 'Number of projects')
JT_COUNT = Gauge('awx_job_templates_total', 'Number of job templates')
WFJT_COUNT = Gauge('awx_workflow_job_templates_total', 'Number of workflow job templates')
HOST_COUNT = Gauge('awx_hosts_total', 'Number of hosts', ['type',])
SCHEDULE_COUNT = Gauge('awx_schedules_total', 'Number of schedules')
INV_SCRIPT_COUNT = Gauge('awx_inventory_scripts_total', 'Number of inventory scripts')
USER_SESSIONS = Gauge('awx_sessions_total', 'Number of sessions', ['type',])
CUSTOM_VENVS = Gauge('awx_custom_virtualenvs_total', 'Number of virtualenvs')
RUNNING_JOBS = Gauge('awx_running_jobs_total', 'Number of running jobs on the Tower system')

# Per-instance metrics.
# NOTE(review): the label is named 'type', but metrics() fills it with the
# instance uuid. Renaming it would change the exported label, so it is left
# as is here.
INSTANCE_CAPACITY = Gauge('awx_instance_capacity', 'Capacity of each node in a Tower system', ['type',])
INSTANCE_CPU = Gauge('awx_instance_cpu', 'CPU cores on each node in a Tower system', ['type',])
INSTANCE_MEMORY = Gauge('awx_instance_memory', 'RAM (Kb) on each node in a Tower system', ['type',])
INSTANCE_INFO = Info('awx_instance', 'Info about each node in a Tower system', ['type',])

# Per-execution-node job counts, broken down by launch type and by status.
INSTANCE_LAUNCH_TYPE = Gauge('awx_instance_launch_type_total', 'Type of Job launched', ['node', 'launch_type',])
INSTANCE_STATUS = Gauge('awx_instance_status_total', 'Status of Job launched', ['node', 'status',])


def metrics():
    """Refresh every AWX metric object and return the rendered registry.

    Reads license and settings values, the object counts from ``counts()``,
    the per-instance data from ``instance_info()`` and the per-node job
    counts from ``job_instance_counts()``, stores them on the module-level
    ``Gauge``/``Info`` objects, and then renders the registry.

    Returns:
        The result of ``generate_latest()``.
    """
    license_info = get_license(show_key=False)

    # NOTE(review): the original looked up 'free instances' (with a space),
    # while every other key read here uses underscores. Prefer the underscore
    # spelling and fall back to the original key so neither form is lost;
    # confirm the actual key name in the license data.
    free_instances = license_info.get(
        'free_instances', license_info.get('free instances', 0))

    SYSTEM_INFO.info({
        'system_uuid': settings.SYSTEM_UUID,
        'tower_url_base': settings.TOWER_URL_BASE,
        'tower_version': get_awx_version(),
        'ansible_version': get_ansible_version(),
        'license_type': license_info.get('license_type', 'UNLICENSED'),
        'free_instances': str(free_instances),
        'license_expiry': str(license_info.get('time_remaining', 0)),
        'pendo_tracking': settings.PENDO_TRACKING_STATE,
        'external_logger_enabled': str(settings.LOG_AGGREGATOR_ENABLED),
        'external_logger_type': getattr(settings, 'LOG_AGGREGATOR_TYPE', 'None'),
    })

    current_counts = counts(datetime.now())

    ORG_COUNT.set(current_counts['organization'])
    USER_COUNT.set(current_counts['user'])
    TEAM_COUNT.set(current_counts['team'])
    INV_COUNT.set(current_counts['inventory'])
    PROJ_COUNT.set(current_counts['project'])
    JT_COUNT.set(current_counts['job_template'])
    WFJT_COUNT.set(current_counts['workflow_job_template'])

    HOST_COUNT.labels(type='all').set(current_counts['host'])
    HOST_COUNT.labels(type='active').set(current_counts['active_host_count'])

    SCHEDULE_COUNT.set(current_counts['schedule'])
    INV_SCRIPT_COUNT.set(current_counts['custom_inventory_script'])
    CUSTOM_VENVS.set(current_counts['custom_virtualenvs'])

    USER_SESSIONS.labels(type='all').set(current_counts['active_sessions'])
    USER_SESSIONS.labels(type='user').set(current_counts['active_user_sessions'])
    USER_SESSIONS.labels(type='anonymous').set(current_counts['active_anonymous_sessions'])

    RUNNING_JOBS.set(current_counts['running_jobs'])

    instance_data = instance_info(datetime.now())
    for uuid in instance_data:
        instance = instance_data[uuid]
        INSTANCE_CAPACITY.labels(type=uuid).set(instance['capacity'])
        INSTANCE_CPU.labels(type=uuid).set(instance['cpu'])
        INSTANCE_MEMORY.labels(type=uuid).set(instance['memory'])
        INSTANCE_INFO.labels(type=uuid).info({
            'enabled': str(instance['enabled']),
            # Each entry is a plain dict, so the value must be read by key.
            # getattr() on the dict never found it and always produced the
            # 'None' default. str() keeps the 'None' text when it is unset.
            'last_isolated_check': str(instance.get('last_isolated_check')),
            'managed_by_policy': str(instance['managed_by_policy']),
            'version': instance['version'],
        })

    job_counts = job_instance_counts(datetime.now())
    for node, node_counts in job_counts.items():
        # skipping internal execution node (for system jobs)
        # TODO: determine if we should exclude execution_node from instance count
        if node == '':
            continue
        for launch_type, value in node_counts.get('launch_type', {}).items():
            INSTANCE_LAUNCH_TYPE.labels(node=node, launch_type=launch_type).set(value)
        # Iterate the status counts here; the original looped over the
        # launch-type dict a second time, so the status gauge was populated
        # with launch-type data.
        for status, value in node_counts.get('status', {}).items():
            INSTANCE_STATUS.labels(node=node, status=status).set(value)

    return generate_latest()


__all__ = ['metrics']
+ # Alertmanager configuration + alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. + rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + # A scrape configuration containing exactly one endpoint to scrape: + # Here it's Prometheus itself. + scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'prometheus' + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + static_configs: + - targets: ['localhost:9090'] + - job_name: 'awx' + tls_config: + insecure_skip_verify: True + metrics_path: /api/v2/metrics + scrape_interval: 5s + scheme: https + params: + format: ['txt'] + basic_auth: + username: root + password: reverse + # bearer_token: + static_configs: + - targets: + - awxweb:8043 diff --git a/requirements/requirements.in b/requirements/requirements.in index 0fa2258ebf..b03208a3ed 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -29,6 +29,7 @@ jsonschema==2.6.0 Markdown==2.6.11 # used for formatting API help ordereddict==1.1 pexpect==4.6.0 +prometheus_client==0.6.0 psutil==5.4.3 psycopg2==2.7.3.2 # problems with Segmentation faults / wheels on upgrade pygerduty==0.37.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 7816cfe85c..cd8533fc53 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -74,6 +74,7 @@ oauthlib==2.0.6 # via django-oauth-toolkit, requests-oauthlib, social- ordereddict==1.1 pexpect==4.6.0 pkgconfig==1.4.0 # via xmlsec +prometheus_client==0.6.0 psutil==5.4.3 psycopg2==2.7.3.2 ptyprocess==0.6.0 # via pexpect