Custom Prometheus Exporter Development

Building custom exporters enables monitoring of application-specific metrics and proprietary systems. This guide covers developing exporters in Go and Python, understanding Prometheus metric types, implementing the exposition format, and testing exporters so that business-critical metrics are exposed alongside infrastructure monitoring.

Tabla de Contenidos

Introducción

Custom exporters bridge the gap between application code and Prometheus. They expose metrics about business logic, custom algorithms, queue depths, and application state that standard exporters cannot capture. Building exporters is straightforward with Prometheus client libraries.

Prometheus Metric Types

Counter

Monotonically increasing value (never decreases):

# Application purchases
app_purchases_total{region="us"} 1000
app_purchases_total{region="eu"} 500

Gauge

Value that can increase or decrease:

# Current queue depth
app_queue_depth 42
app_memory_usage_bytes 104857600

Histogram

Samples observations and counts them in buckets:

# Request duration in seconds
app_request_duration_seconds_bucket{le="0.1"} 100
app_request_duration_seconds_bucket{le="0.5"} 500
app_request_duration_seconds_bucket{le="1.0"} 850
app_request_duration_seconds_bucket{le="+Inf"} 1000
app_request_duration_seconds_sum 5000
app_request_duration_seconds_count 1000

Summary

Similar to histogram but calculates quantiles:

# API latency percentiles
app_api_latency_seconds{quantile="0.5"} 0.05
app_api_latency_seconds{quantile="0.9"} 0.1
app_api_latency_seconds{quantile="0.99"} 0.2
app_api_latency_seconds_sum 500
app_api_latency_seconds_count 10000

Exposition Format

Text Format Structure

# HELP metric_name Metric description
# TYPE metric_name metric_type
metric_name{label1="value1",label2="value2"} value timestamp

Example Metrics

# HELP app_users Total active users
# TYPE app_users gauge
app_users{environment="production"} 5000

# HELP app_orders_total Total orders processed
# TYPE app_orders_total counter
app_orders_total{region="us"} 100000
app_orders_total{region="eu"} 75000

# HELP app_request_duration_seconds Request latency in seconds
# TYPE app_request_duration_seconds histogram
app_request_duration_seconds_bucket{le="0.1"} 100
app_request_duration_seconds_bucket{le="1.0"} 500
app_request_duration_seconds_bucket{le="10.0"} 950
app_request_duration_seconds_bucket{le="+Inf"} 1000
app_request_duration_seconds_sum 5000
app_request_duration_seconds_count 1000

Go Client Library

Basic Exporter

package main

import (
	"net/http"
	"log"
	
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Package-level metrics exposed by the basic exporter.
var (
	// activeUsers is a gauge holding the number of active users.
	activeUsers = prometheus.NewGauge(
		prometheus.GaugeOpts{Name: "app_active_users", Help: "Number of active users"},
	)

	// totalOrders counts processed orders, partitioned by the "region" label.
	totalOrders = prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "app_orders_total", Help: "Total orders processed"},
		[]string{"region"},
	)

	// requestDuration is a histogram of request latency in seconds,
	// partitioned by the "endpoint" label, with explicit bucket bounds.
	requestDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "app_request_duration_seconds",
			Help:    "Request latency in seconds",
			Buckets: []float64{0.1, 0.5, 1.0, 2.5, 5.0},
		},
		[]string{"endpoint"},
	)
)

// init registers the package-level metrics with the default Prometheus
// registerer. MustRegister panics if any registration fails.
func init() {
	prometheus.MustRegister(activeUsers, totalOrders, requestDuration)
}

// main seeds the example metrics with fixed values and then serves them on
// :8080 under /metrics until the HTTP server fails.
func main() {
	// Seed the gauge and the per-region order counters.
	activeUsers.Set(5000)
	seed := []struct {
		region string
		orders float64
	}{
		{region: "us", orders: 100},
		{region: "eu", orders: 75},
	}
	for _, s := range seed {
		totalOrders.WithLabelValues(s.region).Add(s.orders)
	}

	// Publish everything registered with the default registry.
	http.Handle("/metrics", promhttp.Handler())

	// ListenAndServe always returns a non-nil error, so this always exits.
	if err := http.ListenAndServe(":8080", nil); err != nil {
		log.Fatal(err)
	}
}

Advanced Go Exporter

package main

import (
	"context"
	"log"
	"net/http"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Collector groups the Prometheus metrics the application records.
// Instances are built by NewCollector and made scrapeable via Register.
type Collector struct {
	// activeConnections is the gauge named "app_active_connections".
	activeConnections prometheus.Gauge
	// cacheHits is the counter named "app_cache_hits_total".
	cacheHits         prometheus.Counter
	// cacheMisses is the counter named "app_cache_misses_total".
	cacheMisses       prometheus.Counter
	// processingTime is the histogram named "app_processing_time_seconds".
	processingTime    prometheus.Histogram
}

// NewCollector builds a Collector with freshly created, unregistered metrics.
// Call Register to expose them through the default registry.
func NewCollector() *Collector {
	connections := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "app_active_connections",
		Help: "Current active connections",
	})

	hits := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "app_cache_hits_total",
		Help: "Total cache hits",
	})

	misses := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "app_cache_misses_total",
		Help: "Total cache misses",
	})

	// Uses the client library's default bucket layout.
	elapsed := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "app_processing_time_seconds",
		Help:    "Processing time in seconds",
		Buckets: prometheus.DefBuckets,
	})

	return &Collector{
		activeConnections: connections,
		cacheHits:         hits,
		cacheMisses:       misses,
		processingTime:    elapsed,
	}
}

// Register adds all of the collector's metrics to the default Prometheus
// registerer, in declaration order. It panics if any registration fails
// (for example, when Register is called twice).
func (c *Collector) Register() {
	prometheus.MustRegister(
		c.activeConnections,
		c.cacheHits,
		c.cacheMisses,
		c.processingTime,
	)
}

// RecordCacheHit adds one to the cache-hit counter.
func (c *Collector) RecordCacheHit() {
	c.cacheHits.Add(1)
}

// RecordCacheMiss adds one to the cache-miss counter.
func (c *Collector) RecordCacheMiss() {
	c.cacheMisses.Add(1)
}

// MeasureOperation runs fn and records how long it took in the
// processing-time histogram. The duration is observed whether fn returns
// nil, returns an error, or panics. fn's error is returned unchanged.
func (c *Collector) MeasureOperation(fn func() error) error {
	// NewTimer is evaluated now (capturing the start time); only
	// ObserveDuration is deferred until the function returns.
	defer prometheus.NewTimer(c.processingTime).ObserveDuration()

	err := fn()
	return err
}

// main wires up the collector, starts a background goroutine that simulates
// metric updates every 10 seconds, and serves /metrics on :8080.
//
// The updater goroutine is bound to a cancellable context and tracked by a
// WaitGroup, so it is stopped and awaited once the HTTP server returns
// instead of being leaked.
func main() {
	collector := NewCollector()
	collector.Register()

	ctx, cancel := context.WithCancel(context.Background())
	var wg sync.WaitGroup

	// Simulate metric updates until the context is cancelled.
	wg.Add(1)
	go func() {
		defer wg.Done()

		ticker := time.NewTicker(10 * time.Second)
		defer ticker.Stop()

		for {
			collector.RecordCacheHit()
			collector.RecordCacheMiss()
			collector.activeConnections.Set(42)

			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}
		}
	}()

	http.Handle("/metrics", promhttp.Handler())

	// ListenAndServe blocks and only returns with a non-nil error. Stop the
	// updater and wait for it before exiting via log.Fatal.
	err := http.ListenAndServe(":8080", nil)
	cancel()
	wg.Wait()
	log.Fatal(err)
}

Python Client Library

Basic Python Exporter

from prometheus_client import start_http_server, Gauge, Counter, Histogram
import time
import random

# Metric definitions, registered with the default registry on creation.
active_users = Gauge('app_active_users', 'Number of active users')
total_orders = Counter('app_orders_total', 'Total orders', ['region'])
request_duration = Histogram(
    'app_request_duration_seconds',
    'Request duration',
    buckets=(0.1, 0.5, 1.0, 2.5, 5.0),
)


def _simulate_tick():
    """Push one round of random sample values into every metric."""
    active_users.set(random.randint(4000, 6000))

    for region in ('us', 'eu'):
        total_orders.labels(region=region).inc(random.randint(1, 10))

    # Time a fake unit of work so the histogram receives an observation.
    with request_duration.time():
        time.sleep(random.uniform(0.1, 1.0))


if __name__ == '__main__':
    # Serve the metrics over HTTP on port 8000.
    start_http_server(8000)

    # Keep producing sample data, pausing 10 seconds between rounds.
    while True:
        _simulate_tick()
        time.sleep(10)

Advanced Python Exporter

from prometheus_client import start_http_server, CollectorRegistry, Gauge, Counter, Histogram
from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily
import threading
import time

class ApplicationCollector:
    """Custom collector that reports application state at scrape time.

    State is held in plain integer attributes guarded by a lock. ``collect``
    turns the current values into metric families each time it is called.
    """

    def __init__(self):
        self.active_connections = 0
        self.cache_hits = 0
        self.cache_misses = 0
        # Guards the three values above against concurrent updates.
        self._lock = threading.Lock()

    def collect(self):
        """Yield one metric family per tracked value.

        Yields, in order: the ``app_active_connections`` gauge, the
        ``app_cache_hits_total`` counter and the ``app_cache_misses_total``
        counter.
        """
        # Take a consistent snapshot and release the lock BEFORE yielding.
        # A generator suspended at ``yield`` inside the ``with`` block would
        # keep the (non-reentrant) lock held, blocking every record_*/set_*
        # call until the consumer resumes or discards the generator.
        with self._lock:
            connections = self.active_connections
            hits = self.cache_hits
            misses = self.cache_misses

        # Gauge metric
        yield GaugeMetricFamily('app_active_connections',
                                'Current active connections',
                                value=connections)

        # Counter metrics
        hits_family = CounterMetricFamily('app_cache_hits_total',
                                          'Total cache hits')
        hits_family.add_metric([], hits)
        yield hits_family

        misses_family = CounterMetricFamily('app_cache_misses_total',
                                            'Total cache misses')
        misses_family.add_metric([], misses)
        yield misses_family

    def record_cache_hit(self):
        """Increment the cache-hit count by one."""
        with self._lock:
            self.cache_hits += 1

    def record_cache_miss(self):
        """Increment the cache-miss count by one."""
        with self._lock:
            self.cache_misses += 1

    def set_connections(self, count):
        """Replace the active-connection count with ``count``."""
        with self._lock:
            self.active_connections = count

if __name__ == '__main__':
    # `random` is used below but is not among this example's imports, so
    # bring it into scope here to avoid a NameError on the first iteration.
    import random

    # Use a dedicated registry so only the custom collector is exported.
    registry = CollectorRegistry()
    collector = ApplicationCollector()
    registry.register(collector)

    start_http_server(8000, registry=registry)

    # Simulate metric updates
    while True:
        collector.set_connections(random.randint(100, 500))
        collector.record_cache_hit()
        # Record a miss on roughly one in five iterations.
        if random.random() > 0.8:
            collector.record_cache_miss()
        time.sleep(10)

Testing Exporters

Unit Tests (Go)

package main

import (
	"testing"
	"github.com/prometheus/client_golang/prometheus"
)

// TestMetricRecording verifies that each RecordCacheHit call increments the
// cache-hit counter by exactly one.
func TestMetricRecording(t *testing.T) {
	collector := NewCollector()

	// Test cache hit recording
	collector.RecordCacheHit()
	collector.RecordCacheHit()

	// Gather through a private registry so the test never touches the
	// process-wide default registry.
	registry := prometheus.NewRegistry()
	registry.MustRegister(collector.cacheHits)

	families, err := registry.Gather()
	if err != nil {
		t.Fatalf("gathering metrics: %v", err)
	}
	if len(families) != 1 || len(families[0].GetMetric()) != 1 {
		t.Fatalf("expected exactly one cache-hit sample, got %v", families)
	}
	if got := families[0].GetMetric()[0].GetCounter().GetValue(); got != 2 {
		t.Errorf("app_cache_hits_total = %v, want 2", got)
	}
}

// TestConnectionTracking verifies that setting the active-connections gauge
// is reflected in the gathered metric value.
func TestConnectionTracking(t *testing.T) {
	collector := NewCollector()

	collector.activeConnections.Set(42)

	// Gather through a private registry so the test never touches the
	// process-wide default registry.
	registry := prometheus.NewRegistry()
	registry.MustRegister(collector.activeConnections)

	families, err := registry.Gather()
	if err != nil {
		t.Fatalf("gathering metrics: %v", err)
	}
	if len(families) != 1 || len(families[0].GetMetric()) != 1 {
		t.Fatalf("expected exactly one active-connections sample, got %v", families)
	}
	if got := families[0].GetMetric()[0].GetGauge().GetValue(); got != 42 {
		t.Errorf("app_active_connections = %v, want 42", got)
	}
}

Unit Tests (Python)

import pytest
from prometheus_client import CollectorRegistry, Gauge, Counter

def test_gauge_metric():
    """A gauge registered on an isolated registry reports the value set on it."""
    registry = CollectorRegistry()
    g = Gauge('test_gauge', 'Test gauge', registry=registry)

    g.set(42)
    # Read the value back through the registry's public API rather than the
    # client's private `_value` attribute.
    assert registry.get_sample_value('test_gauge') == 42

def test_counter_metric():
    """A counter accumulates both default and explicit increments."""
    registry = CollectorRegistry()
    c = Counter('test_counter', 'Test counter', registry=registry)

    c.inc()
    c.inc(5)
    # Counters are exposed with a `_total` suffix; read the sample through
    # the registry's public API rather than the private `_value` attribute.
    assert registry.get_sample_value('test_counter_total') == 6

def test_collector():
    """The custom collector emits three families carrying the recorded values."""
    # NOTE(review): ApplicationCollector must be imported from the exporter
    # module; this snippet's imports do not show it.
    collector = ApplicationCollector()

    collector.record_cache_hit()
    collector.record_cache_miss()
    collector.set_connections(100)

    # Generate metrics
    metrics = list(collector.collect())
    assert len(metrics) == 3

    # Check the values, not just the number of families.
    samples = {s.name: s.value for family in metrics for s in family.samples}
    assert samples == {
        'app_active_connections': 100,
        'app_cache_hits_total': 1,
        'app_cache_misses_total': 1,
    }

Integration Tests

# Test exporter endpoint
curl -s http://localhost:8080/metrics | grep app_

# Verify Prometheus scrape (URL is quoted so the shell does not treat '?' as a glob)
curl -s 'http://localhost:9090/api/v1/query?query=app_cache_hits_total'

# Load test exporter
ab -n 1000 http://localhost:8080/metrics

Mejores Prácticas

Naming Conventions

# Format: <namespace>_<subsystem>_<metric>_<unit>
app_cache_hits_total        # Counter
app_processing_time_seconds # Histogram
app_queue_depth             # Gauge
app_memory_bytes            # Gauge with bytes unit

Label Strategy

// ✓ Good - consistent labels
// Both labels draw from a small, fixed set of values, so the number of
// time series stays bounded.
totalOrders := prometheus.NewCounterVec(prometheus.CounterOpts{
    Name: "app_orders_total",
    Help: "Total orders",
}, []string{"region", "product_type"})

// ✗ Bad - unbounded cardinality
// Every distinct label value creates its own time series; keying a metric
// by user ID lets the series count grow with the user base.
userLabels := prometheus.NewCounterVec(prometheus.CounterOpts{
    Name: "app_users_total",
    Help: "Users",
}, []string{"user_id"})  // Could be millions of unique values

Error Handling

# Metric updates are best-effort: a failure to record a value is logged
# but must not interrupt the application code being instrumented.
try:
    metric.set(value)
except Exception as e:
    # Don't fail, log and continue
    logger.error(f"Failed to update metric: {e}")

Despliegue

Docker Container

# Build stage: compile the exporter.
FROM golang:1.20 AS builder
WORKDIR /app
COPY . .
# CGO_ENABLED=0 produces a statically linked binary. Without it the binary is
# linked against glibc from the Debian-based golang image and will not start
# on the musl-based Alpine runtime image below.
RUN CGO_ENABLED=0 go build -o exporter .

# Runtime stage: ship only the compiled binary.
FROM alpine:latest
COPY --from=builder /app/exporter /
EXPOSE 8080
CMD ["/exporter"]

Systemd Service

# systemd unit that runs the exporter as a long-lived service.
[Unit]
Description=Custom Application Exporter
# Order this unit after network.target.
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/bin/app-exporter
# Restart the exporter whenever it exits.
Restart=always

[Install]
# Enabling the unit attaches it to multi-user.target.
WantedBy=multi-user.target

Prometheus Configuration

# Scrape the exporter on localhost:8080 every 30 seconds.
scrape_configs:
  - job_name: 'app-exporter'
    static_configs:
      # Host and port where the exporter serves its metrics endpoint.
      - targets: ['localhost:8080']
    # Per-job scrape interval.
    scrape_interval: 30s

Solución de Problemas

Verifying Metrics Export

# Check metrics endpoint
curl http://localhost:8080/metrics

# Filter specific metrics (-s hides curl's progress meter when piping)
curl -s http://localhost:8080/metrics | grep app_

# Test Prometheus query
curl 'http://localhost:9090/api/v1/query?query=app_cache_hits_total'

Debug Issues

# Enable debug logging
# NOTE: only has an effect if the exporter reads the DEBUG environment
# variable; the example exporters in this guide do not.
DEBUG=true ./app-exporter

# Check metric types
curl -s http://localhost:8080/metrics | grep "# TYPE"

# Validate format (promtool lints the exposition text read from stdin)
curl -s http://localhost:8080/metrics | promtool check metrics

Conclusión

Custom exporters extend observability to business logic and proprietary systems. By following this guide, you've learned to build exporters that expose application metrics alongside infrastructure monitoring. Focus on choosing appropriate metric types, designing label structures that avoid cardinality explosion, and thoroughly testing exporters before deployment. Custom metrics enable real-time visibility into your applications' behavior.