Socle V004 – Métriques

Socle V004 - Métriques

15 – Metrics

Version : 4.0.0 Date : 2025-12-09

1. Introduction

Le Socle V4 expose des métriques au format Prometheus pour le monitoring et l’alerting.

Types de métriques

  • Counter : Valeur qui ne fait qu’augmenter (requêtes, erreurs)
  • Gauge : Valeur qui peut monter et descendre (connexions actives)
  • Histogram : Distribution de valeurs (latences)
  • Summary : Similaire à un histogramme, mais avec des percentiles pré-calculés côté client (non agrégeables entre plusieurs instances)

2. Configuration

2.1 application.yml

# Spring Boot Actuator settings used to publish the metrics endpoints over HTTP.
management:
  endpoints:
    web:
      exposure:
        # Actuator endpoints made reachable over HTTP.
        include: prometheus,health,info,metrics
      # URL prefix shared by the exposed endpoints (e.g. /actuator/prometheus).
      base-path: /actuator
  endpoint:
    prometheus:
      enabled: true
  metrics:
    export:
      prometheus:
        # NOTE(review): Spring Boot 3.x renamed this key to
        # management.prometheus.metrics.export.enabled — confirm the Boot version in use.
        enabled: true
    # Tags attached to the exported metrics; the ${socle.*} placeholders are
    # resolved from properties defined outside this snippet.
    tags:
      application: ${socle.app_name}
      environment: ${socle.env_name}
      region: ${socle.region}

2.2 Dépendances Maven

<!-- Micrometer registry for the Prometheus format.
     No <version> is declared here; presumably it is managed by a parent POM/BOM (confirm). -->
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-registry-prometheus</artifactId>
</dependency>

3. Métriques Socle

3.1 Métriques Workers

# Nombre de workers
socle_workers_total{application="socle-v4"} 5

# Workers healthy
socle_workers_healthy{application="socle-v4"} 5

# Workers unhealthy
socle_workers_unhealthy{application="socle-v4"} 0

# État par worker
socle_worker_status{worker="kafka-consumer",status="RUNNING"} 1
socle_worker_status{worker="order-processor",status="RUNNING"} 1

# Heartbeats par worker
socle_worker_heartbeats_total{worker="kafka-consumer"} 1234
socle_worker_missed_heartbeats{worker="kafka-consumer"} 0

3.2 Métriques KvBus

# Opérations
socle_kvbus_operations_total{operation="get"} 12345
socle_kvbus_operations_total{operation="put"} 6789
socle_kvbus_operations_total{operation="delete"} 234

# Latence
socle_kvbus_operation_duration_seconds{operation="get",quantile="0.5"} 0.001
socle_kvbus_operation_duration_seconds{operation="get",quantile="0.95"} 0.005
socle_kvbus_operation_duration_seconds{operation="get",quantile="0.99"} 0.01

# Nombre de clés
socle_kvbus_keys_count 456

3.3 Métriques Pipeline

# Exécutions
socle_pipeline_executions_total{pipeline="order-processing",status="SUCCESS"} 1234
socle_pipeline_executions_total{pipeline="order-processing",status="FAILURE"} 12

# Durée
socle_pipeline_duration_seconds{pipeline="order-processing",quantile="0.5"} 0.5
socle_pipeline_duration_seconds{pipeline="order-processing",quantile="0.95"} 2.0
socle_pipeline_duration_seconds{pipeline="order-processing",quantile="0.99"} 5.0

# Steps
socle_pipeline_step_duration_seconds{step="validation",quantile="0.5"} 0.01
socle_pipeline_step_duration_seconds{step="processing",quantile="0.5"} 0.3

3.4 Métriques Resilience

# Circuit breaker état (0=CLOSED, 1=HALF_OPEN, 2=OPEN)
socle_circuit_breaker_state{name="payment-gateway"} 0

# Tentatives de retry
socle_retry_attempts_total{operation="external-api",attempt="1",success="true"} 1000
socle_retry_attempts_total{operation="external-api",attempt="2",success="true"} 50
socle_retry_attempts_total{operation="external-api",attempt="3",success="false"} 5

3.5 Métriques TechDB (V4)

# Opérations
socle_techdb_operations_total{operation="saveOffset"} 5678
socle_techdb_operations_total{operation="getOffset"} 12345

# Taille des tables
socle_techdb_rows_count{table="socle_offsets"} 23
socle_techdb_rows_count{table="socle_events"} 456
socle_techdb_rows_count{table="socle_log_fallback"} 0

3.6 Métriques LogForwarder (V4)

# Queue
socle_logforwarder_queue_size 45
socle_logforwarder_queue_capacity 10000

# Logs envoyés
socle_logforwarder_logs_sent_total 123456
socle_logforwarder_logs_failed_total 23
socle_logforwarder_logs_fallback_total 0

# Batches
socle_logforwarder_batches_sent_total 1234
socle_logforwarder_batch_size{quantile="0.5"} 100

4. Implémentation

4.1 Enregistrement des métriques

package eu.lmvi.socle.metrics;

/**
 * Central holder for the generic Socle request metrics.
 *
 * <p>Registers two counters ({@code socle_requests_total}, {@code socle_errors_total}),
 * one gauge ({@code socle_active_connections}) and one timer
 * ({@code socle_request_duration_seconds}) on the injected {@link MeterRegistry},
 * and exposes small helper methods to update them.
 */
@Component
public class SocleMetrics {

    private final MeterRegistry registry;

    // Counters
    private final Counter requestsTotal;
    private final Counter errorsTotal;

    // Gauges: the gauge reads this value each time it is sampled.
    private final AtomicInteger activeConnections = new AtomicInteger(0);

    // Timers
    private final Timer requestDuration;

    /**
     * Creates and registers every meter on the given registry.
     *
     * @param registry the Micrometer registry that will own the meters
     */
    public SocleMetrics(MeterRegistry registry) {
        this.registry = registry;

        // Counter
        this.requestsTotal = Counter.builder("socle_requests_total")
            .description("Total number of requests")
            .register(registry);

        this.errorsTotal = Counter.builder("socle_errors_total")
            .description("Total number of errors")
            .register(registry);

        // Gauge
        Gauge.builder("socle_active_connections", activeConnections, AtomicInteger::get)
            .description("Number of active connections")
            .register(registry);

        // Timer
        // publishPercentileHistogram() exports the *_bucket series that PromQL's
        // histogram_quantile() needs; publishPercentiles() alone only yields
        // client-side quantiles, which cannot be aggregated across instances.
        this.requestDuration = Timer.builder("socle_request_duration_seconds")
            .description("Request duration in seconds")
            .publishPercentiles(0.5, 0.95, 0.99)
            .publishPercentileHistogram()
            .register(registry);
    }

    /** Counts one incoming request. */
    public void recordRequest() {
        requestsTotal.increment();
    }

    /** Counts one failed request. */
    public void recordError() {
        errorsTotal.increment();
    }

    /** Increments the active-connection gauge value. */
    public void connectionOpened() {
        activeConnections.incrementAndGet();
    }

    /** Decrements the active-connection gauge value. */
    public void connectionClosed() {
        activeConnections.decrementAndGet();
    }

    /**
     * Starts a timing sample bound to this registry.
     *
     * @return the sample to hand back to {@link #stopTimer(Timer.Sample)}
     */
    public Timer.Sample startTimer() {
        return Timer.start(registry);
    }

    /**
     * Stops the sample and records its duration on the request timer.
     *
     * @param sample a sample previously obtained from {@link #startTimer()}
     */
    public void stopTimer(Timer.Sample sample) {
        sample.stop(requestDuration);
    }
}

4.2 Utilisation dans le code

/**
 * Example service showing how business code reports to {@link SocleMetrics}.
 */
@Service
public class OrderService {

    @Autowired
    private SocleMetrics metrics;

    /**
     * Processes an order, counting the request, timing it and counting failures.
     *
     * @param order the order to process
     * @return the order returned by {@code doProcess}
     */
    public Order processOrder(Order order) {
        final Timer.Sample timing = metrics.startTimer();
        metrics.recordRequest();

        try {
            return doProcess(order);
        } catch (Exception failure) {
            // Count the failure, then let the caller see the original exception.
            metrics.recordError();
            throw failure;
        } finally {
            // Runs on both success and failure so every request is timed.
            metrics.stopTimer(timing);
        }
    }
}

4.3 Métriques avec tags

/**
 * Per-worker metrics carrying {@code worker} / {@code status} / {@code type} tags.
 */
@Component
public class WorkerMetrics {

    private final MeterRegistry registry;

    /**
     * Backing values of the {@code socle_worker_status} gauges, keyed by worker
     * name and then by status. Holding them here keeps a strong reference to the
     * object each gauge reads, and lets a status change reset the previous one.
     */
    private final java.util.Map<String, java.util.Map<String, AtomicInteger>> statusFlags =
        new java.util.concurrent.ConcurrentHashMap<>();

    /**
     * @param registry the Micrometer registry on which the meters are registered
     */
    public WorkerMetrics(MeterRegistry registry) {
        this.registry = registry;
    }

    /**
     * Publishes the current status of a worker.
     *
     * <p>The series for {@code status} is set to 1 and every other status already
     * seen for the same worker is set to 0, so exactly one status reads 1 per worker.
     * Concurrent calls for the same worker are not serialized against each other.
     *
     * @param workerName value of the {@code worker} tag
     * @param status     value of the {@code status} tag (e.g. {@code RUNNING})
     */
    public void recordWorkerStatus(String workerName, String status) {
        java.util.Map<String, AtomicInteger> flags = statusFlags.computeIfAbsent(
            workerName, name -> new java.util.concurrent.ConcurrentHashMap<>());

        // Register the gauge only the first time this (worker, status) pair is seen.
        flags.computeIfAbsent(status, newStatus -> {
            AtomicInteger flag = new AtomicInteger(0);
            Gauge.builder("socle_worker_status", flag, AtomicInteger::get)
                .tag("worker", workerName)
                .tag("status", newStatus)
                .register(registry);
            return flag;
        });

        // Raise the current status and clear the ones the worker has left.
        flags.forEach((knownStatus, flag) -> flag.set(knownStatus.equals(status) ? 1 : 0));
    }

    /**
     * Counts one processed item for a worker.
     *
     * @param workerName value of the {@code worker} tag
     * @param type       value of the {@code type} tag
     */
    public void recordProcessed(String workerName, String type) {
        Counter.builder("socle_worker_processed_total")
            .tag("worker", workerName)
            .tag("type", type)
            .register(registry)
            .increment();
    }
}

5. Endpoint Prometheus

5.1 Accès

curl http://localhost:8080/actuator/prometheus

5.2 Sortie

# HELP socle_workers_total Number of workers
# TYPE socle_workers_total gauge
socle_workers_total{application="socle-v4",environment="PROD",region="MTQ"} 5

# HELP socle_workers_healthy Number of healthy workers
# TYPE socle_workers_healthy gauge
socle_workers_healthy{application="socle-v4",environment="PROD",region="MTQ"} 5

# HELP socle_requests_total Total number of requests
# TYPE socle_requests_total counter
socle_requests_total{application="socle-v4",environment="PROD",region="MTQ"} 12345

# HELP socle_request_duration_seconds Request duration in seconds
# TYPE socle_request_duration_seconds summary
socle_request_duration_seconds{application="socle-v4",quantile="0.5"} 0.05
socle_request_duration_seconds{application="socle-v4",quantile="0.95"} 0.2
socle_request_duration_seconds{application="socle-v4",quantile="0.99"} 0.5
socle_request_duration_seconds_count{application="socle-v4"} 12345
socle_request_duration_seconds_sum{application="socle-v4"} 617.25

6. Prometheus Configuration

6.1 prometheus.yml

global:
  scrape_interval: 15s

scrape_configs:
  # Fixed target, scraped on the Actuator Prometheus endpoint.
  - job_name: 'socle-v4'
    metrics_path: '/actuator/prometheus'
    static_configs:
      - targets: ['socle-app:8080']
        labels:
          app: 'socle-v4'
          env: 'prod'

  # Pods discovered through the Kubernetes API and selected by annotations.
  - job_name: 'socle-v4-kubernetes'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods annotated with prometheus.io/scrape: "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Use the prometheus.io/path annotation as the metrics path.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Use the prometheus.io/port annotation as the scrape port; without this
      # rule the annotation is ignored and the discovered address is used as-is.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__

6.2 Kubernetes annotations

apiVersion: v1
kind: Pod
metadata:
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/actuator/prometheus"
    prometheus.io/port: "8080"

7. Grafana Dashboards

7.1 Exemple de requêtes

# Taux de requêtes par seconde
rate(socle_requests_total[5m])

# Taux d'erreurs
rate(socle_errors_total[5m]) / rate(socle_requests_total[5m]) * 100

# Latence P95
histogram_quantile(0.95, rate(socle_request_duration_seconds_bucket[5m]))

# Workers unhealthy
socle_workers_unhealthy

# Circuit breakers ouverts
socle_circuit_breaker_state == 2

# Queue LogForwarder
socle_logforwarder_queue_size / socle_logforwarder_queue_capacity * 100

7.2 Dashboard JSON

{
  "title": "Socle V4 Dashboard",
  "panels": [
    {
      "title": "Request Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "rate(socle_requests_total[5m])",
          "legendFormat": "{{application}}"
        }
      ]
    },
    {
      "title": "Error Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "rate(socle_errors_total[5m]) / rate(socle_requests_total[5m]) * 100",
          "legendFormat": "Error %"
        }
      ]
    },
    {
      "title": "P95 Latency",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, rate(socle_request_duration_seconds_bucket[5m]))",
          "legendFormat": "P95"
        }
      ]
    },
    {
      "title": "Workers Status",
      "type": "stat",
      "targets": [
        {
          "expr": "socle_workers_healthy",
          "legendFormat": "Healthy"
        }
      ]
    }
  ]
}

8. Alerting

8.1 Règles d'alerte Prometheus

groups:
  - name: socle-alerts
    rules:
      - alert: SocleHighErrorRate
        expr: rate(socle_errors_total[5m]) / rate(socle_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.application }}"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: SocleWorkerUnhealthy
        expr: socle_workers_unhealthy > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Unhealthy workers on {{ $labels.application }}"
          description: "{{ $value }} workers are unhealthy"

      - alert: SocleCircuitBreakerOpen
        expr: socle_circuit_breaker_state == 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Circuit breaker {{ $labels.name }} is OPEN"

      - alert: SocleLogForwarderQueueHigh
        expr: socle_logforwarder_queue_size / socle_logforwarder_queue_capacity > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "LogForwarder queue is {{ $value | humanizePercentage }} full"

9. Bonnes pratiques

DO

  • Utiliser des noms de métriques cohérents (socle_*)
  • Ajouter des tags pertinents (application, environment, region)
  • Utiliser des histogrammes pour les latences
  • Définir des alertes sur les métriques critiques
  • Documenter les métriques

DON’T

  • Ne pas créer trop de métriques (cardinalité)
  • Ne pas utiliser de valeurs à haute cardinalité dans les tags
  • Ne pas oublier les métriques d’erreur
  • Ne pas ignorer les métriques de queue/buffer

10. Références

Commentaires

Laisser un commentaire

Votre adresse e-mail ne sera pas publiée. Les champs obligatoires sont indiqués avec *