Centralized logging via otel

2026-03-19 01:47:31 -02:30 · 2024-05-13 11:16:17 -04:00
parent d0fe0ed796
commit 0eb465531c
11 changed files with 270 additions and 0 deletions
--- a/tools/docker-compose/README.md
+++ b/tools/docker-compose/README.md
@@ -613,3 +613,13 @@ docker exec -it -e VAULT_TOKEN=<token> tools_vault_1 vault kv get --address=http
 ### Prometheus and Grafana integration

 See docs at https://github.com/ansible/awx/blob/devel/tools/grafana/README.md
+
+### OpenTelemetry Integration
+
+```bash
+OTEL=true GRAFANA=true LOKI=true PROMETHEUS=true make docker-compose
+```
+
+This will start the sidecar container `tools_otel_1` and configure AWX logging to send to it. The OpenTelemetry Collector is configured to export logs to Loki. Grafana is configured with Loki as a datasource. AWX logs can be viewed in Grafana.
+
+Grafana is served at `http://localhost:3001`; select the `Loki` datasource there to browse the AWX logs.
--- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2
+++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2
@@ -269,6 +269,49 @@ services:
      # pg_notify will NOT work in transaction mode.
      PGBOUNCER_POOL_MODE: session
 {% endif %}
+{% if enable_otel|bool %}
+  # OpenTelemetry Collector sidecar; its pipelines come from the mounted config file.
+  otel:
+    image: otel/opentelemetry-collector-contrib:0.88.0
+    container_name: tools_otel_1
+    hostname: otel
+    command: ["--config=/etc/otel-collector-config.yaml", ""]
+    networks:
+      - awx
+    ports:
+      - "4317:4317"   # OTLP gRPC receiver
+      - "4318:4318"   # OTLP http receiver
+      - "55679:55679" # zpages http://localhost:55679/debug/servicez /tracez
+    volumes:
+      - "../../otel/otel-collector-config.yaml:/etc/otel-collector-config.yaml"
+{% if enable_loki|bool %}
+    # Only declare the dependency when the loki service is rendered; Compose
+    # rejects a depends_on entry that names an undefined service.
+    depends_on:
+      - loki
+{% endif %}
+{% endif %}
+{% if enable_loki|bool %}
+  loki:
+    image: grafana/loki:2.9.5
+    container_name: tools_loki_1
+    hostname: loki
+    ports:
+      - "3100:3100"
+    command: -config.file=/etc/loki/local-config.yaml
+    networks:
+      - awx
+    volumes:
+      - "loki_storage:/loki:rw"
+      #- "../../docker-compose/loki/volumes/index:/loki/index"
+      #- "../../docker-compose/loki/volumes/boltdb-cache:/loki/boltdb-cache"
+      - "../../loki/local-config.yaml:/etc/loki/local-config.yaml"
+    # NOTE(review): the grafana service is presumably optional as well; confirm
+    # it is always rendered when loki is, or guard this dependency similarly.
+    depends_on:
+      - grafana
+{% endif %}
+
 {% if execution_node_count|int > 0 %}
  receptor-hop:
    image: {{ receptor_image }}
@@ -360,6 +396,10 @@ volumes:
  grafana_storage:
    name: tools_grafana_storage
 {% endif %}
+{% if enable_loki|bool %}
+  loki_storage:  # named volume mounted at /loki by the loki service
+    name: tools_loki_storage
+{% endif %}

 networks:
  awx:
--- a/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2
+++ b/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2
@@ -46,6 +46,18 @@ OPTIONAL_API_URLPATTERN_PREFIX = '{{ api_urlpattern_prefix }}'
 # LOGGING['loggers']['django_auth_ldap']['handlers'] = ['console']
 # LOGGING['loggers']['django_auth_ldap']['level'] = 'DEBUG'

+{% if enable_otel|bool %}
+LOGGING['handlers']['otel'] |= {
+    'class': 'awx.main.utils.handlers.OTLPHandler',
+    'endpoint': 'http://otel:4317',
+}
+# Add otel log handler to all log handlers
+for name in LOGGING['loggers'].keys():
+    handler = LOGGING['loggers'][name].get('handlers', [])
+    if 'otel' not in handler:
+        LOGGING['loggers'][name].get('handlers', []).append('otel')
+{% endif %}
+
 BROADCAST_WEBSOCKET_PORT = 8013
 BROADCAST_WEBSOCKET_VERIFY_CERT = False
 BROADCAST_WEBSOCKET_PROTOCOL = 'http'
--- a/tools/grafana/datasources/loki_source.yml
+++ b/tools/grafana/datasources/loki_source.yml
@@ -0,0 +1,11 @@
+---
+apiVersion: 1
+
+datasources:  # registers Loki as a Grafana datasource
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100  # hostname and port of the loki service in the docker-compose template
+    jsonData:
+      timeout: 60  # presumably seconds; confirm against the Grafana Loki datasource docs
+      maxLines: 100000  # presumably the cap on log lines returned per query; confirm
--- a/tools/loki/local-config.yaml
+++ b/tools/loki/local-config.yaml
@@ -0,0 +1,96 @@
+auth_enabled: false  # Loki config mounted into the tools_loki_1 container; authentication is off
+
+server:
+  http_listen_port: 3100  # same port the loki compose service publishes (3100:3100)
+  grpc_server_max_recv_msg_size: 524288000 # 500 MB
+  grpc_server_max_send_msg_size: 524288000 # 500 MB, might be too much, be careful
+
+frontend_worker:
+  match_max_concurrent: true
+  grpc_client_config:
+    max_send_msg_size: 524288000 # 500 MB
+
+
+ingester:
+  max_chunk_age: 8766h  # 8766h = 365.25 days
+
+common:
+  path_prefix: /loki  # /loki is where the compose file mounts the loki_storage volume
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+# compactor:
+#   retention_enabled: true
+#   # cmeyers: a 1s interval seems wrong, but it works
+#   compaction_interval: 1s # default 10m
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: boltdb-shipper
+      object_store: filesystem
+      schema: v11
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  boltdb_shipper:
+    active_index_directory: /loki/index
+    cache_location: /loki/boltdb-cache
+
+ruler:
+  alertmanager_url: http://localhost:9093
+
+limits_config:
+  retention_period: 3y  # NOTE(review): the compactor block above is commented out; confirm retention is actually enforced
+  # cmeyers: The default of 30m triggers a loop of queries that take a long time
+  # to complete and the UI times out
+  split_queries_by_interval: 1d
+  # cmeyers: Default of 30d1h limits grafana time queries. Can't, for example,
+  # query last 90 days
+  max_query_length: 3y
+  # cmeyers: Accepting old samples made the batch post request succeed.
+  reject_old_samples: false
+  reject_old_samples_max_age: 365d
+
+  ingestion_rate_mb: 32
+  ingestion_burst_size_mb: 32
+  per_stream_rate_limit: 32M
+  per_stream_rate_limit_burst: 32M
+  ingestion_rate_strategy: local # Default: global
+  max_global_streams_per_user: 100000000
+  max_entries_limit_per_query: 100000000
+  max_query_series: 1000000
+  max_query_parallelism: 32 # Old Default: 14
+  max_streams_per_user: 100000000 # Old Default: 10000
+
+# Taken from aap-log-visualizer
+frontend:
+  max_outstanding_per_tenant: 2048
+
+query_scheduler:
+  max_outstanding_requests_per_tenant: 2048
+
+query_range:
+  parallelise_shardable_queries: false
+  split_queries_by_interval: 0  # NOTE(review): limits_config sets this to 1d; confirm which value Loki 2.9 honours
+
+# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
+# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
+#
+# Statistics help us better understand how Loki is used, and they show us performance
+# levels for most users. This helps us prioritize features and documentation.
+# For more information on what's sent, look at
+# https://github.com/grafana/loki/blob/main/pkg/usagestats/stats.go
+# Refer to the buildReport method to see what goes into a report.
+#
+# If you would like to disable reporting, uncomment the following lines:
+#analytics:
+#  reporting_enabled: false
--- a/tools/otel/otel-collector-config.yaml
+++ b/tools/otel/otel-collector-config.yaml
@@ -0,0 +1,45 @@
+# OpenTelemetry Collector pipeline for the tools_otel_1 sidecar:
+# OTLP logs in -> batch -> Loki push API.
+receivers:
+  otlp:
+    protocols:
+      # Bare keys enable each protocol with the collector's default settings.
+      grpc:
+      # The compose service publishes 4318 for OTLP/HTTP, so enable that receiver too.
+      http:
+
+exporters:
+  # Defined for troubleshooting; not referenced by any pipeline below.
+  debug:
+    verbosity: detailed
+
+  loki:
+    endpoint: http://loki:3100/loki/api/v1/push
+    tls:
+      insecure: true
+    headers:
+      "X-Scope-OrgID": "1"
+    default_labels_enabled:
+      exporter: true
+      job: true
+      instance: true
+      level: true
+
+processors:
+  batch:
+
+extensions:
+  health_check:
+  zpages:
+    endpoint: ":55679"
+
+service:
+  pipelines:
+    logs:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [loki]
+
+  extensions:
+    - health_check
+    - zpages