15 – Metrics
Version : 4.0.0 Date : 2025-12-09
1. Introduction
Le Socle V4 expose des métriques au format Prometheus pour le monitoring et l’alerting.
Types de métriques
- Counter : Valeur qui ne fait qu’augmenter (requêtes, erreurs)
- Gauge : Valeur qui peut monter et descendre (connexions actives)
- Histogram : Distribution de valeurs (latences)
- Summary : Similaire à Histogram, mais avec des quantiles pré-calculés côté application
2. Configuration
2.1 application.yml
# Spring Boot Actuator / Micrometer settings exposing the Prometheus endpoint.
management:
  endpoints:
    web:
      exposure:
        # Actuator endpoints reachable over HTTP.
        include: prometheus,health,info,metrics
      base-path: /actuator
  endpoint:
    prometheus:
      enabled: true
  metrics:
    export:
      prometheus:
        enabled: true
    # Tags attached to every exported metric.
    tags:
      application: ${socle.app_name}
      environment: ${socle.env_name}
      region: ${socle.region}
2.2 Dépendances Maven
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
3. Métriques Socle
3.1 Métriques Workers
# Nombre de workers
socle_workers_total{application="socle-v4"} 5
# Workers healthy
socle_workers_healthy{application="socle-v4"} 5
# Workers unhealthy
socle_workers_unhealthy{application="socle-v4"} 0
# État par worker
socle_worker_status{worker="kafka-consumer",status="RUNNING"} 1
socle_worker_status{worker="order-processor",status="RUNNING"} 1
# Heartbeats par worker
socle_worker_heartbeats_total{worker="kafka-consumer"} 1234
socle_worker_missed_heartbeats{worker="kafka-consumer"} 0
3.2 Métriques KvBus
# Opérations
socle_kvbus_operations_total{operation="get"} 12345
socle_kvbus_operations_total{operation="put"} 6789
socle_kvbus_operations_total{operation="delete"} 234
# Latence
socle_kvbus_operation_duration_seconds{operation="get",quantile="0.5"} 0.001
socle_kvbus_operation_duration_seconds{operation="get",quantile="0.95"} 0.005
socle_kvbus_operation_duration_seconds{operation="get",quantile="0.99"} 0.01
# Nombre de clés
socle_kvbus_keys_count 456
3.3 Métriques Pipeline
# Exécutions
socle_pipeline_executions_total{pipeline="order-processing",status="SUCCESS"} 1234
socle_pipeline_executions_total{pipeline="order-processing",status="FAILURE"} 12
# Durée
socle_pipeline_duration_seconds{pipeline="order-processing",quantile="0.5"} 0.5
socle_pipeline_duration_seconds{pipeline="order-processing",quantile="0.95"} 2.0
socle_pipeline_duration_seconds{pipeline="order-processing",quantile="0.99"} 5.0
# Steps
socle_pipeline_step_duration_seconds{step="validation",quantile="0.5"} 0.01
socle_pipeline_step_duration_seconds{step="processing",quantile="0.5"} 0.3
3.4 Métriques Resilience
# Circuit breaker état (0=CLOSED, 1=HALF_OPEN, 2=OPEN)
socle_circuit_breaker_state{name="payment-gateway"} 0
# Tentatives de retry
socle_retry_attempts_total{operation="external-api",attempt="1",success="true"} 1000
socle_retry_attempts_total{operation="external-api",attempt="2",success="true"} 50
socle_retry_attempts_total{operation="external-api",attempt="3",success="false"} 5
3.5 Métriques TechDB (V4)
# Opérations
socle_techdb_operations_total{operation="saveOffset"} 5678
socle_techdb_operations_total{operation="getOffset"} 12345
# Taille des tables
socle_techdb_rows_count{table="socle_offsets"} 23
socle_techdb_rows_count{table="socle_events"} 456
socle_techdb_rows_count{table="socle_log_fallback"} 0
3.6 Métriques LogForwarder (V4)
# Queue
socle_logforwarder_queue_size 45
socle_logforwarder_queue_capacity 10000
# Logs envoyés
socle_logforwarder_logs_sent_total 123456
socle_logforwarder_logs_failed_total 23
socle_logforwarder_logs_fallback_total 0
# Batches
socle_logforwarder_batches_sent_total 1234
socle_logforwarder_batch_size{quantile="0.5"} 100
4. Implémentation
4.1 Enregistrement des métriques
package eu.lmvi.socle.metrics;
/**
 * Registers the Socle request metrics on a {@link MeterRegistry} and exposes
 * small helper methods to update them.
 *
 * <p>Meters registered by this class:
 * <ul>
 *   <li>{@code socle_requests_total} - counter, incremented by {@link #recordRequest()}</li>
 *   <li>{@code socle_errors_total} - counter, incremented by {@link #recordError()}</li>
 *   <li>{@code socle_active_connections} - gauge backed by an {@link AtomicInteger}</li>
 *   <li>{@code socle_request_duration_seconds} - timer with percentiles and histogram buckets</li>
 * </ul>
 */
@Component
public class SocleMetrics {

    private final MeterRegistry registry;

    // Counters
    private final Counter requestsTotal;
    private final Counter errorsTotal;

    // Backing value read by the socle_active_connections gauge.
    private final AtomicInteger activeConnections = new AtomicInteger(0);

    // Timers
    private final Timer requestDuration;

    /**
     * Creates and registers all meters on the given registry.
     *
     * @param registry registry the meters are registered on; also used to start timer samples
     */
    public SocleMetrics(MeterRegistry registry) {
        this.registry = registry;

        // Counters
        this.requestsTotal = Counter.builder("socle_requests_total")
            .description("Total number of requests")
            .register(registry);
        this.errorsTotal = Counter.builder("socle_errors_total")
            .description("Total number of errors")
            .register(registry);

        // Gauge: the registry reads the current value of activeConnections on demand.
        Gauge.builder("socle_active_connections", activeConnections, AtomicInteger::get)
            .description("Number of active connections")
            .register(registry);

        // Timer
        this.requestDuration = Timer.builder("socle_request_duration_seconds")
            .description("Request duration in seconds")
            .publishPercentiles(0.5, 0.95, 0.99)
            // Histogram buckets are required by queries of the form
            // histogram_quantile(..., rate(socle_request_duration_seconds_bucket[5m]));
            // client-side percentiles alone do not produce any *_bucket series.
            // NOTE(review): with buckets enabled the exposition TYPE becomes "histogram";
            // confirm the Micrometer version in use still emits the quantile series too.
            .publishPercentileHistogram()
            .register(registry);
    }

    /** Increments {@code socle_requests_total} by one. */
    public void recordRequest() {
        requestsTotal.increment();
    }

    /** Increments {@code socle_errors_total} by one. */
    public void recordError() {
        errorsTotal.increment();
    }

    /** Increments the value reported by {@code socle_active_connections}. */
    public void connectionOpened() {
        activeConnections.incrementAndGet();
    }

    /** Decrements the value reported by {@code socle_active_connections}. */
    public void connectionClosed() {
        activeConnections.decrementAndGet();
    }

    /**
     * Starts a timing sample.
     *
     * @return the sample to pass to {@link #stopTimer(Timer.Sample)}
     */
    public Timer.Sample startTimer() {
        return Timer.start(registry);
    }

    /**
     * Stops the sample and records it on {@code socle_request_duration_seconds}.
     *
     * @param sample sample previously returned by {@link #startTimer()}
     */
    public void stopTimer(Timer.Sample sample) {
        sample.stop(requestDuration);
    }
}
4.2 Utilisation dans le code
/**
 * Example service showing how to instrument a business method with
 * {@link SocleMetrics}: one request count, one error count on failure,
 * and a duration sample recorded in every case.
 */
@Service
public class OrderService {

    @Autowired
    private SocleMetrics metrics;

    /**
     * Processes an order while recording request, error and duration metrics.
     *
     * @param order the order to process
     * @return the order returned by {@code doProcess}
     */
    public Order processOrder(Order order) {
        final Timer.Sample timing = metrics.startTimer();
        metrics.recordRequest();
        try {
            return doProcess(order);
        } catch (Exception failure) {
            // Count the failure, then let the caller see the original exception.
            metrics.recordError();
            throw failure;
        } finally {
            // Runs on both the success and the failure path.
            metrics.stopTimer(timing);
        }
    }
}
4.3 Métriques avec tags
/**
 * Records per-worker metrics carrying {@code worker} / {@code status} / {@code type} tags.
 */
@Component
public class WorkerMetrics {

    private final MeterRegistry registry;

    /** Status gauge currently registered for each worker, keyed by worker name. */
    private final java.util.concurrent.ConcurrentMap<String, Gauge> statusGauges =
        new java.util.concurrent.ConcurrentHashMap<>();

    /**
     * @param registry registry the worker meters are registered on
     */
    public WorkerMetrics(MeterRegistry registry) {
        // The final field must be assigned here; without a constructor the class does not compile.
        this.registry = registry;
    }

    /**
     * Publishes {@code socle_worker_status{worker,status} 1} for the worker's current status.
     *
     * <p>When the status changes, the gauge registered for the previous status is removed
     * so that a worker never reports several statuses at once.
     *
     * @param workerName value of the {@code worker} tag
     * @param status     value of the {@code status} tag
     */
    public void recordWorkerStatus(String workerName, String status) {
        statusGauges.compute(workerName, (name, current) -> {
            if (current != null) {
                if (status.equals(current.getId().getTag("status"))) {
                    // Same status as before: keep the existing gauge.
                    return current;
                }
                // Status changed: drop the stale series before registering the new one.
                registry.remove(current);
            }
            return Gauge.builder("socle_worker_status", () -> 1)
                .tag("worker", name)
                .tag("status", status)
                .register(registry);
        });
    }

    /**
     * Increments {@code socle_worker_processed_total} for the given worker and type.
     *
     * @param workerName value of the {@code worker} tag
     * @param type       value of the {@code type} tag
     */
    public void recordProcessed(String workerName, String type) {
        Counter.builder("socle_worker_processed_total")
            .tag("worker", workerName)
            .tag("type", type)
            .register(registry)
            .increment();
    }
}
5. Endpoint Prometheus
5.1 Accès
curl http://localhost:8080/actuator/prometheus
5.2 Sortie
# HELP socle_workers_total Number of workers
# TYPE socle_workers_total gauge
socle_workers_total{application="socle-v4",environment="PROD",region="MTQ"} 5
# HELP socle_workers_healthy Number of healthy workers
# TYPE socle_workers_healthy gauge
socle_workers_healthy{application="socle-v4",environment="PROD",region="MTQ"} 5
# HELP socle_requests_total Total number of requests
# TYPE socle_requests_total counter
socle_requests_total{application="socle-v4",environment="PROD",region="MTQ"} 12345
# HELP socle_request_duration_seconds Request duration in seconds
# TYPE socle_request_duration_seconds summary
socle_request_duration_seconds{application="socle-v4",quantile="0.5"} 0.05
socle_request_duration_seconds{application="socle-v4",quantile="0.95"} 0.2
socle_request_duration_seconds{application="socle-v4",quantile="0.99"} 0.5
socle_request_duration_seconds_count{application="socle-v4"} 12345
socle_request_duration_seconds_sum{application="socle-v4"} 617.25
6. Prometheus Configuration
6.1 prometheus.yml
global:
  scrape_interval: 15s

scrape_configs:
  # Statically configured target.
  - job_name: 'socle-v4'
    metrics_path: '/actuator/prometheus'
    static_configs:
      - targets: ['socle-app:8080']
        labels:
          app: 'socle-v4'
          env: 'prod'

  # Kubernetes pod discovery driven by the prometheus.io/* pod annotations.
  - job_name: 'socle-v4-kubernetes'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods annotated with prometheus.io/scrape: "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Use prometheus.io/path as the metrics path when it is set.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Use prometheus.io/port as the scrape port when it is set.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
6.2 Kubernetes annotations
apiVersion: v1
kind: Pod
metadata:
  # Annotations read by the Prometheus Kubernetes scrape job.
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/actuator/prometheus"
    prometheus.io/port: "8080"
7. Grafana Dashboards
7.1 Exemple de requêtes
# Taux de requêtes par seconde
rate(socle_requests_total[5m])
# Taux d'erreurs
rate(socle_errors_total[5m]) / rate(socle_requests_total[5m]) * 100
# Latence P95
histogram_quantile(0.95, rate(socle_request_duration_seconds_bucket[5m]))
# Workers unhealthy
socle_workers_unhealthy
# Circuit breakers ouverts
socle_circuit_breaker_state == 2
# Queue LogForwarder
socle_logforwarder_queue_size / socle_logforwarder_queue_capacity * 100
7.2 Dashboard JSON
{
"title": "Socle V4 Dashboard",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(socle_requests_total[5m])",
"legendFormat": "{{application}}"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "rate(socle_errors_total[5m]) / rate(socle_requests_total[5m]) * 100",
"legendFormat": "Error %"
}
]
},
{
"title": "P95 Latency",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(socle_request_duration_seconds_bucket[5m]))",
"legendFormat": "P95"
}
]
},
{
"title": "Workers Status",
"type": "stat",
"targets": [
{
"expr": "socle_workers_healthy",
"legendFormat": "Healthy"
}
]
}
]
}
8. Alerting
8.1 Règles d’alerte Prometheus (notifications routées par Alertmanager)
groups:
  - name: socle-alerts
    rules:
      # Error ratio above 5% for 5 minutes.
      - alert: SocleHighErrorRate
        expr: rate(socle_errors_total[5m]) / rate(socle_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.application }}"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # At least one unhealthy worker for 2 minutes.
      - alert: SocleWorkerUnhealthy
        expr: socle_workers_unhealthy > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Unhealthy workers on {{ $labels.application }}"
          description: "{{ $value }} workers are unhealthy"

      # Circuit breaker state 2 = OPEN.
      - alert: SocleCircuitBreakerOpen
        expr: socle_circuit_breaker_state == 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Circuit breaker {{ $labels.name }} is OPEN"

      # LogForwarder queue more than 80% full for 5 minutes.
      - alert: SocleLogForwarderQueueHigh
        expr: socle_logforwarder_queue_size / socle_logforwarder_queue_capacity > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "LogForwarder queue is {{ $value | humanizePercentage }} full"
9. Bonnes pratiques
DO
- Utiliser des noms de métriques cohérents (socle_*)
- Ajouter des tags pertinents (application, environment, region)
- Utiliser des histogrammes pour les latences
- Définir des alertes sur les métriques critiques
- Documenter les métriques
DON’T
- Ne pas créer trop de métriques (cardinalité)
- Ne pas utiliser de valeurs à haute cardinalité dans les tags
- Ne pas oublier les métriques d’erreur
- Ne pas ignorer les métriques de queue/buffer









