feat: observability, IP filtering, stdout/clickhouse fixes (v1.1.11)

- feat(observability): metrics server with /metrics and /health endpoints - feat(observability): correlation metrics (events, success/failed, reasons, buffers) - feat(correlation): IP exclusion filter (exact IPs and CIDR ranges) - feat(correlation): pending orphan delay for late-arriving B events - fix(stdout): sink is now a no-op for data; JSON must never appear on stdout - fix(clickhouse): all flush errors were silently discarded, now properly logged - fix(clickhouse): buffer overflow with DropOnOverflow now logged at WARN - fix(clickhouse): retry attempts logged at WARN with attempt/delay/error context - feat(clickhouse): connection success logged at INFO, batch sends at DEBUG - feat(clickhouse): SetLogger() for external logger injection - test(stdout): assert stdout remains empty for correlated and orphan logs - chore(rpm): bump version to 1.1.11, update changelog - docs: README and architecture.yml updated Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-05 11:40:54 +01:00
parent 68f0fcf810
commit e9dcd8ea51
16 changed files with 2035 additions and 116 deletions
--- a/internal/observability/metrics.go
+++ b/internal/observability/metrics.go
@ -0,0 +1,176 @@
+package observability
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+	"sync"
+	"sync/atomic"
+)
+
+// CorrelationMetrics tracks correlation statistics for debugging and monitoring.
+type CorrelationMetrics struct {
+	mu sync.RWMutex
+
+	// Events received
+	eventsReceivedA atomic.Int64
+	eventsReceivedB atomic.Int64
+
+	// Correlation results
+	correlationsSuccess atomic.Int64
+	correlationsFailed  atomic.Int64
+
+	// Failure reasons
+	failedNoMatchKey      atomic.Int64 // No event with same key in buffer
+	failedTimeWindow      atomic.Int64 // Key found but outside time window
+	failedBufferEviction  atomic.Int64 // Event evicted due to buffer full
+	failedTTLExpired      atomic.Int64 // B event TTL expired before match
+	failedIPExcluded      atomic.Int64 // Event excluded by IP filter
+
+	// Buffer stats
+	bufferASize atomic.Int64
+	bufferBSize atomic.Int64
+
+	// Orphan stats
+	orphansEmittedA  atomic.Int64
+	orphansEmittedB  atomic.Int64
+	orphansPendingA  atomic.Int64
+	pendingOrphanMatch atomic.Int64 // B matched with pending orphan A
+
+	// Keep-Alive stats
+	keepAliveResets atomic.Int64 // Number of TTL resets (one-to-many mode)
+}
+
+// NewCorrelationMetrics creates a new metrics tracker.
+func NewCorrelationMetrics() *CorrelationMetrics {
+	return &CorrelationMetrics{}
+}
+
+// RecordEventReceived records an event received from a source.
+func (m *CorrelationMetrics) RecordEventReceived(source string) {
+	if source == "A" {
+		m.eventsReceivedA.Add(1)
+	} else if source == "B" {
+		m.eventsReceivedB.Add(1)
+	}
+}
+
+// RecordCorrelationSuccess records a successful correlation.
+func (m *CorrelationMetrics) RecordCorrelationSuccess() {
+	m.correlationsSuccess.Add(1)
+}
+
+// RecordCorrelationFailed records a failed correlation attempt with the reason.
+func (m *CorrelationMetrics) RecordCorrelationFailed(reason string) {
+	m.correlationsFailed.Add(1)
+	switch reason {
+	case "no_match_key":
+		m.failedNoMatchKey.Add(1)
+	case "time_window":
+		m.failedTimeWindow.Add(1)
+	case "buffer_eviction":
+		m.failedBufferEviction.Add(1)
+	case "ttl_expired":
+		m.failedTTLExpired.Add(1)
+	case "ip_excluded":
+		m.failedIPExcluded.Add(1)
+	}
+}
+
+// RecordBufferEviction records an event evicted from buffer.
+func (m *CorrelationMetrics) RecordBufferEviction(source string) {
+	// Can be used for additional tracking if needed
+}
+
+// RecordOrphanEmitted records an orphan event emitted.
+func (m *CorrelationMetrics) RecordOrphanEmitted(source string) {
+	if source == "A" {
+		m.orphansEmittedA.Add(1)
+	} else if source == "B" {
+		m.orphansEmittedB.Add(1)
+	}
+}
+
+// RecordPendingOrphan records an A event added to pending orphans.
+func (m *CorrelationMetrics) RecordPendingOrphan() {
+	m.orphansPendingA.Add(1)
+}
+
+// RecordPendingOrphanMatch records a B event matching a pending orphan A.
+func (m *CorrelationMetrics) RecordPendingOrphanMatch() {
+	m.pendingOrphanMatch.Add(1)
+}
+
+// RecordKeepAliveReset records a TTL reset for Keep-Alive.
+func (m *CorrelationMetrics) RecordKeepAliveReset() {
+	m.keepAliveResets.Add(1)
+}
+
+// UpdateBufferSizes updates the current buffer sizes.
+func (m *CorrelationMetrics) UpdateBufferSizes(sizeA, sizeB int64) {
+	m.bufferASize.Store(sizeA)
+	m.bufferBSize.Store(sizeB)
+}
+
+// Snapshot returns a point-in-time snapshot of all metrics.
+func (m *CorrelationMetrics) Snapshot() MetricsSnapshot {
+	return MetricsSnapshot{
+		EventsReceivedA:       m.eventsReceivedA.Load(),
+		EventsReceivedB:       m.eventsReceivedB.Load(),
+		CorrelationsSuccess:   m.correlationsSuccess.Load(),
+		CorrelationsFailed:    m.correlationsFailed.Load(),
+		FailedNoMatchKey:      m.failedNoMatchKey.Load(),
+		FailedTimeWindow:      m.failedTimeWindow.Load(),
+		FailedBufferEviction:  m.failedBufferEviction.Load(),
+		FailedTTLExpired:      m.failedTTLExpired.Load(),
+		FailedIPExcluded:      m.failedIPExcluded.Load(),
+		BufferASize:           m.bufferASize.Load(),
+		BufferBSize:           m.bufferBSize.Load(),
+		OrphansEmittedA:       m.orphansEmittedA.Load(),
+		OrphansEmittedB:       m.orphansEmittedB.Load(),
+		OrphansPendingA:       m.orphansPendingA.Load(),
+		PendingOrphanMatch:    m.pendingOrphanMatch.Load(),
+		KeepAliveResets:       m.keepAliveResets.Load(),
+	}
+}
+
+// MetricsSnapshot is a point-in-time snapshot of metrics.
+type MetricsSnapshot struct {
+	EventsReceivedA      int64 `json:"events_received_a"`
+	EventsReceivedB      int64 `json:"events_received_b"`
+	CorrelationsSuccess  int64 `json:"correlations_success"`
+	CorrelationsFailed   int64 `json:"correlations_failed"`
+	FailedNoMatchKey     int64 `json:"failed_no_match_key"`
+	FailedTimeWindow     int64 `json:"failed_time_window"`
+	FailedBufferEviction int64 `json:"failed_buffer_eviction"`
+	FailedTTLExpired     int64 `json:"failed_ttl_expired"`
+	FailedIPExcluded     int64 `json:"failed_ip_excluded"`
+	BufferASize          int64 `json:"buffer_a_size"`
+	BufferBSize          int64 `json:"buffer_b_size"`
+	OrphansEmittedA      int64 `json:"orphans_emitted_a"`
+	OrphansEmittedB      int64 `json:"orphans_emitted_b"`
+	OrphansPendingA      int64 `json:"orphans_pending_a"`
+	PendingOrphanMatch   int64 `json:"pending_orphan_match"`
+	KeepAliveResets      int64 `json:"keepalive_resets"`
+}
+
+// MarshalJSON implements json.Marshaler.
+func (m *CorrelationMetrics) MarshalJSON() ([]byte, error) {
+	return json.Marshal(m.Snapshot())
+}
+
+// String returns a human-readable string of metrics.
+func (m *CorrelationMetrics) String() string {
+	s := m.Snapshot()
+	var b strings.Builder
+	b.WriteString("Correlation Metrics:\n")
+	fmt.Fprintf(&b, "  Events Received: A=%d B=%d Total=%d\n", s.EventsReceivedA, s.EventsReceivedB, s.EventsReceivedA+s.EventsReceivedB)
+	fmt.Fprintf(&b, "  Correlations: Success=%d Failed=%d\n", s.CorrelationsSuccess, s.CorrelationsFailed)
+	fmt.Fprintf(&b, "  Failure Reasons: no_match_key=%d time_window=%d buffer_eviction=%d ttl_expired=%d ip_excluded=%d\n",
+		s.FailedNoMatchKey, s.FailedTimeWindow, s.FailedBufferEviction, s.FailedTTLExpired, s.FailedIPExcluded)
+	fmt.Fprintf(&b, "  Buffer Sizes: A=%d B=%d\n", s.BufferASize, s.BufferBSize)
+	fmt.Fprintf(&b, "  Orphans: Emitted A=%d B=%d Pending A=%d\n", s.OrphansEmittedA, s.OrphansEmittedB, s.OrphansPendingA)
+	fmt.Fprintf(&b, "  Pending Orphan Match: %d\n", s.PendingOrphanMatch)
+	fmt.Fprintf(&b, "  Keep-Alive Resets: %d\n", s.KeepAliveResets)
+	return b.String()
+}
--- a/internal/observability/metrics_server.go
+++ b/internal/observability/metrics_server.go
@ -0,0 +1,128 @@
+package observability
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net"
+	"net/http"
+	"sync"
+	"time"
+)
+
+// MetricsServer exposes correlation metrics via HTTP.
+type MetricsServer struct {
+	mu          sync.Mutex
+	server      *http.Server
+	listener    net.Listener
+	metricsFunc func() MetricsSnapshot
+	running     bool
+}
+
+// NewMetricsServer creates a new metrics HTTP server.
+func NewMetricsServer(addr string, metricsFunc func() MetricsSnapshot) (*MetricsServer, error) {
+	if metricsFunc == nil {
+		return nil, fmt.Errorf("metricsFunc cannot be nil")
+	}
+
+	ms := &MetricsServer{
+		metricsFunc: metricsFunc,
+	}
+
+	mux := http.NewServeMux()
+	mux.HandleFunc("/metrics", ms.handleMetrics)
+	mux.HandleFunc("/health", ms.handleHealth)
+
+	ms.server = &http.Server{
+		Addr:         addr,
+		Handler:      mux,
+		ReadTimeout:  5 * time.Second,
+		WriteTimeout: 10 * time.Second,
+	}
+
+	return ms, nil
+}
+
+// Start begins listening on the configured address.
+func (ms *MetricsServer) Start() error {
+	ms.mu.Lock()
+	defer ms.mu.Unlock()
+
+	if ms.running {
+		return nil
+	}
+
+	listener, err := net.Listen("tcp", ms.server.Addr)
+	if err != nil {
+		return fmt.Errorf("failed to start metrics server: %w", err)
+	}
+
+	ms.listener = listener
+	ms.running = true
+
+	go func() {
+		if err := ms.server.Serve(listener); err != nil && err != http.ErrServerClosed {
+			// Server error or closed
+		}
+	}()
+
+	return nil
+}
+
+// Stop gracefully stops the metrics server.
+func (ms *MetricsServer) Stop(ctx context.Context) error {
+	ms.mu.Lock()
+	defer ms.mu.Unlock()
+
+	if !ms.running {
+		return nil
+	}
+
+	ms.running = false
+	return ms.server.Shutdown(ctx)
+}
+
+// handleMetrics returns the correlation metrics as JSON.
+func (ms *MetricsServer) handleMetrics(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+
+	metrics := ms.metricsFunc()
+
+	w.Header().Set("Content-Type", "application/json")
+	if err := json.NewEncoder(w).Encode(metrics); err != nil {
+		http.Error(w, "Failed to encode metrics", http.StatusInternalServerError)
+		return
+	}
+}
+
+// handleHealth returns a simple health check response.
+func (ms *MetricsServer) handleHealth(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	fmt.Fprintf(w, `{"status":"healthy"}`)
+}
+
+// IsRunning returns true if the server is running.
+func (ms *MetricsServer) IsRunning() bool {
+	ms.mu.Lock()
+	defer ms.mu.Unlock()
+	return ms.running
+}
+
+// Addr returns the listening address.
+func (ms *MetricsServer) Addr() string {
+	ms.mu.Lock()
+	defer ms.mu.Unlock()
+	if ms.listener == nil {
+		return ""
+	}
+	return ms.listener.Addr().String()
+}