feat(observability): add local grafana+prom stack for metrics insights

Julian Tölle 2020-11-22 20:04:56 +01:00
parent 6b1640b753
commit d0a9b0a07c
8 changed files with 295 additions and 0 deletions

@@ -0,0 +1,86 @@
# Adapted from https://github.com/stefanprodan/dockprom/blob/master/docker-compose.yml
version: "3.4"

services:
  prometheus:
    image: prom/prometheus:v2.33.4
    volumes:
      - ./prometheus:/etc/prometheus
      - prometheus_data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=200h"
      - "--web.enable-lifecycle"
    ports:
      - 9090:9090
    networks:
      - observability
      - api

  loki:
    image: grafana/loki:2.5.0
    command: ["-config.file=/etc/loki/loki.yaml"]
    ports:
      - "3100" # loki needs to be exposed so it receives logs
    volumes:
      - ./loki/loki.yaml:/etc/loki/loki.yaml
    networks:
      - observability

  promtail:
    image: grafana/promtail:2.5.0
    command: ["-config.file=/etc/promtail.yaml"]
    volumes:
      - ./promtail/promtail.yaml:/etc/promtail.yaml
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/log/journal/:/var/log/journal/
      - /run/log/journal/:/run/log/journal/
      - /etc/machine-id:/etc/machine-id
    ports:
      - "3102"
    networks:
      - observability
    depends_on:
      - loki

  tempo:
    image: grafana/tempo:1.3.2
    command: ["-config.file=/etc/tempo.yaml"]
    volumes:
      - ./tempo/tempo.yaml:/etc/tempo.yaml
      - tempo_data:/tmp/tempo
    ports:
      - "55680" # OpenTelemetry
      - "3101" # tempo
    networks:
      - observability
      - api

  grafana:
    image: grafana/grafana-oss:8.4.2
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=listory
      - GF_SECURITY_ADMIN_PASSWORD=listory
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_HTTP_PORT=2345
    ports:
      - 2345:2345
    networks:
      - observability

volumes:
  prometheus_data: {}
  tempo_data: {}

networks:
  observability: {}
  api:
    external: true
    name: listory_web
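
Prometheus above scrapes api:9464 and joins the external listory_web network, so the application stack must be reachable on that network under the service name "api" and expose a metrics endpoint on port 9464. A minimal sketch of the application-side compose wiring, with purely illustrative names since that file is not part of this commit:

  # sketch only – service and image names are hypothetical
  services:
    api:
      image: listory            # hypothetical application image
      networks:
        - web                   # the service name "api" becomes the DNS name Prometheus scrapes

  networks:
    web:
      name: listory_web         # the external network referenced above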

@@ -0,0 +1,12 @@
apiVersion: 1

providers:
  - name: "Prometheus"
    orgId: 1
    folder: ""
    type: file
    disableDeletion: false
    editable: true
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards

@@ -0,0 +1,41 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    orgId: 1
    url: http://prometheus:9090
    basicAuth: false
    isDefault: false
    version: 1
    editable: false

  - name: Tempo
    type: tempo
    access: proxy
    orgId: 1
    url: http://tempo:3101
    basicAuth: false
    isDefault: false
    version: 1
    editable: true
    apiVersion: 1
    uid: tempo

  - name: Loki
    type: loki
    access: proxy
    orgId: 1
    url: http://loki:3100
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    jsonData:
      derivedFields:
        - datasourceUid: tempo
          matcherRegex: '"traceId":"([A-Za-z0-9]+)"'
          name: TraceID
          url: $${__value.raw}
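
The derivedFields entry above turns a traceId found in log lines into a link to the Tempo data source (uid tempo). For the matcherRegex to fire, the application has to log its trace ID as a JSON field with exactly that name; a hypothetical log line it would match:

  {"level":"info","message":"GET /links 200","traceId":"0af7651916cd43dd8448eb211c80319c"}

The trace ID value here is made up; the real one comes from the application's OpenTelemetry context.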

@@ -0,0 +1,66 @@
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 1h # Any chunk not receiving new logs in this time will be flushed
  max_chunk_age: 1h # All chunks will be flushed when they hit this age, default is 1h
  chunk_target_size: 1048576 # Loki will attempt to build chunks up to 1MB, flushing first if chunk_idle_period or max_chunk_age is reached first
  chunk_retain_period: 30s # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
  max_transfer_retries: 0 # Chunk transfers disabled
  wal:
    dir: /loki/wal

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /tmp/loki/boltdb-shipper-active
    cache_location: /tmp/loki/boltdb-shipper-cache
    cache_ttl: 24h # Can be increased for faster performance over longer query periods, uses more disk space
    shared_store: filesystem
  filesystem:
    directory: /tmp/loki/chunks

compactor:
  working_directory: /tmp/loki/boltdb-shipper-compactor
  shared_store: filesystem

limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

table_manager:
  retention_deletes_enabled: false
  retention_period: 0s

ruler:
  storage:
    type: local
    local:
      directory: /tmp/loki/rules
  rule_path: /tmp/loki/rules-temp
  ring:
    kvstore:
      store: inmemory
  enable_api: true
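
This Loki config keeps the WAL in /loki/wal and chunks/index under /tmp/loki, but the compose file only defines named volumes for Prometheus and Tempo, so Loki's data is lost whenever its container is re-created. If persistence is wanted, a sketch of the extra compose wiring (volume names are illustrative):

  loki:
    volumes:
      - ./loki/loki.yaml:/etc/loki/loki.yaml
      - loki_wal:/loki          # hypothetical volume for the WAL
      - loki_data:/tmp/loki     # hypothetical volume for chunks and index

  volumes:
    loki_wal: {}
    loki_data: {}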

@@ -0,0 +1,17 @@
global:
  scrape_interval: 15s

# Scrape configurations for the application and the observability stack itself.
scrape_configs:
  - job_name: "listory"
    metrics_path: "/metrics"
    static_configs:
      - targets: ["api:9464"]

  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: "tempo"
    static_configs:
      # Tempo serves /metrics on its HTTP port (3101, see tempo.yaml)
      - targets: ["tempo:3101"]

@@ -0,0 +1,30 @@
server:
  http_listen_port: 3102

clients:
  - url: http://loki:3100/loki/api/v1/push

positions:
  filename: /tmp/positions.yaml

target_config:
  sync_period: 10s

scrape_configs:
  - job_name: listory
    journal:
      labels:
        job: listory
    relabel_configs:
      # services
      - source_labels:
          - __journal__systemd_unit
        target_label: unit
      # docker containers
      - source_labels:
          - __journal_container_name
        target_label: container # use whatever label you like
      - source_labels:
          - container
        action: keep
        regex: listory-.* # only keep api logs
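
The journal scrape above relies on the __journal_container_name field, which is only present when containers log through Docker's journald logging driver (it maps the CONTAINER_NAME journal field), and the keep rule then only passes containers named listory-*. A sketch of the matching application-side service options, with an illustrative container name:

  api:
    container_name: listory-api   # must match the keep regex above
    logging:
      driver: journald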

@@ -0,0 +1,36 @@
server:
  http_listen_port: 3101

distributor:
  receivers:
    otlp:
      protocols:
        http:

ingester:
  trace_idle_period: 10s # the length of time after a trace has not received spans to consider it complete and flush it
  max_block_bytes: 1_000_000 # cut the head block when it hits this size or ...
  max_block_duration: 5m # this much time passes

compactor:
  compaction:
    compaction_window: 1h # blocks in this time window will be compacted together
    max_block_bytes: 100_000_000 # maximum size of compacted blocks
    block_retention: 1h
    compacted_block_retention: 10m

storage:
  trace:
    backend: local # backend configuration to use
    block:
      bloom_filter_false_positive: .05 # bloom filter false positive rate. lower values create larger filters but fewer false positives
      index_downsample_bytes: 1000 # number of bytes per index record
      encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd
    wal:
      path: /tmp/tempo/wal # where to store the wal locally
      encoding: none # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd
    local:
      path: /tmp/tempo/blocks
    pool:
      max_workers: 100 # the worker pool mainly drives querying, but is also used for polling the blocklist
      queue_depth: 10000
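
The receiver block enables OTLP over HTTP on the receiver's default port, while docker-compose.yml publishes 55680 as the OpenTelemetry port; depending on the Tempo/collector version those defaults may not line up. If the listener should be pinned to the published port, the endpoint can be set explicitly; a sketch (the port is taken from the compose file, the option name from the embedded OTLP receiver):

  distributor:
    receivers:
      otlp:
        protocols:
          http:
            endpoint: "0.0.0.0:55680"   # match the port published in docker-compose.yml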