feat: observability, IP filtering, stdout/clickhouse fixes (v1.1.11)
- feat(observability): metrics server with /metrics and /health endpoints - feat(observability): correlation metrics (events, success/failed, reasons, buffers) - feat(correlation): IP exclusion filter (exact IPs and CIDR ranges) - feat(correlation): pending orphan delay for late-arriving B events - fix(stdout): sink is now a no-op for data; JSON must never appear on stdout - fix(clickhouse): all flush errors were silently discarded, now properly logged - fix(clickhouse): buffer overflow with DropOnOverflow now logged at WARN - fix(clickhouse): retry attempts logged at WARN with attempt/delay/error context - feat(clickhouse): connection success logged at INFO, batch sends at DEBUG - feat(clickhouse): SetLogger() for external logger injection - test(stdout): assert stdout remains empty for correlated and orphan logs - chore(rpm): bump version to 1.1.11, update changelog - docs: README and architecture.yml updated Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -164,9 +164,9 @@ func (s *UnixSocketSource) readDatagrams(ctx context.Context, eventChan chan<- *
|
||||
continue
|
||||
}
|
||||
|
||||
// Debug: log raw events
|
||||
s.logger.Debugf("event received: source=%s src_ip=%s src_port=%d",
|
||||
event.Source, event.SrcIP, event.SrcPort)
|
||||
// Debug: log raw events with all key details
|
||||
s.logger.Debugf("event received: source=%s src_ip=%s src_port=%d timestamp=%v raw_timestamp=%v",
|
||||
event.Source, event.SrcIP, event.SrcPort, event.Timestamp, event.Raw["timestamp"])
|
||||
|
||||
select {
|
||||
case eventChan <- event:
|
||||
|
||||
@ -12,6 +12,7 @@ import (
|
||||
|
||||
"github.com/ClickHouse/clickhouse-go/v2"
|
||||
"github.com/logcorrelator/logcorrelator/internal/domain"
|
||||
"github.com/logcorrelator/logcorrelator/internal/observability"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -53,6 +54,12 @@ type ClickHouseSink struct {
|
||||
done chan struct{}
|
||||
wg sync.WaitGroup
|
||||
closeOnce sync.Once
|
||||
logger *observability.Logger
|
||||
}
|
||||
|
||||
// SetLogger sets the logger used by the sink.
|
||||
func (s *ClickHouseSink) SetLogger(logger *observability.Logger) {
|
||||
s.logger = logger.WithFields(map[string]any{"sink": "clickhouse"})
|
||||
}
|
||||
|
||||
// NewClickHouseSink creates a new ClickHouse sink.
|
||||
@ -83,6 +90,7 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
|
||||
buffer: make([]domain.CorrelatedLog, 0, config.BatchSize),
|
||||
flushChan: make(chan struct{}, 1),
|
||||
done: make(chan struct{}),
|
||||
logger: observability.NewLogger("clickhouse"),
|
||||
}
|
||||
|
||||
// Parse DSN and create options
|
||||
@ -107,6 +115,8 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
|
||||
}
|
||||
|
||||
s.conn = conn
|
||||
s.log().Infof("connected to ClickHouse: table=%s batch_size=%d flush_interval_ms=%d",
|
||||
config.Table, config.BatchSize, config.FlushIntervalMs)
|
||||
|
||||
// Start flush goroutine
|
||||
s.wg.Add(1)
|
||||
@ -120,6 +130,14 @@ func (s *ClickHouseSink) Name() string {
|
||||
return "clickhouse"
|
||||
}
|
||||
|
||||
// log returns the logger, initializing a default one if not set (e.g. in tests).
|
||||
func (s *ClickHouseSink) log() *observability.Logger {
|
||||
if s.logger == nil {
|
||||
s.logger = observability.NewLogger("clickhouse")
|
||||
}
|
||||
return s.logger
|
||||
}
|
||||
|
||||
// Reopen is a no-op for ClickHouse (connection is managed internally).
|
||||
func (s *ClickHouseSink) Reopen() error {
|
||||
return nil
|
||||
@ -146,6 +164,7 @@ func (s *ClickHouseSink) Write(ctx context.Context, log domain.CorrelatedLog) er
|
||||
s.mu.Unlock()
|
||||
|
||||
if drop {
|
||||
s.log().Warnf("buffer full, dropping log: table=%s buffer_size=%d", s.config.Table, s.config.MaxBufferSize)
|
||||
return nil
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
@ -201,7 +220,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
select {
|
||||
case <-s.done:
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("final flush on close failed", err)
|
||||
}
|
||||
cancel()
|
||||
return
|
||||
|
||||
@ -212,7 +233,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
|
||||
if needsFlush {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("periodic flush failed", err)
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
|
||||
@ -223,7 +246,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
|
||||
if needsFlush {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("batch flush failed", err)
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
@ -247,7 +272,6 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
return fmt.Errorf("clickhouse connection is not initialized")
|
||||
}
|
||||
|
||||
// Log batch info before sending
|
||||
batchSize := len(buffer)
|
||||
|
||||
// Retry logic with exponential backoff
|
||||
@ -255,6 +279,8 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
for attempt := 0; attempt < MaxRetries; attempt++ {
|
||||
if attempt > 0 {
|
||||
delay := RetryBaseDelay * time.Duration(1<<uint(attempt-1))
|
||||
s.log().Warnf("retrying batch insert: attempt=%d/%d delay=%s rows=%d err=%v",
|
||||
attempt+1, MaxRetries, delay, batchSize, lastErr)
|
||||
select {
|
||||
case <-time.After(delay):
|
||||
case <-ctx.Done():
|
||||
@ -264,6 +290,7 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
|
||||
lastErr = s.executeBatch(ctx, buffer)
|
||||
if lastErr == nil {
|
||||
s.log().Debugf("batch sent: rows=%d table=%s", batchSize, s.config.Table)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@ -2,41 +2,22 @@ package stdout
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/logcorrelator/logcorrelator/internal/domain"
|
||||
"github.com/logcorrelator/logcorrelator/internal/observability"
|
||||
)
|
||||
|
||||
// Config holds the stdout sink configuration.
|
||||
type Config struct {
|
||||
Enabled bool
|
||||
Level string // DEBUG, INFO, WARN, ERROR - filters output verbosity
|
||||
}
|
||||
|
||||
// StdoutSink writes correlated logs to stdout as JSON lines.
|
||||
type StdoutSink struct {
|
||||
config Config
|
||||
mu sync.Mutex
|
||||
enc *json.Encoder
|
||||
minLevel observability.LogLevel
|
||||
}
|
||||
// StdoutSink is a no-op data sink. Operational logs are written to stderr
|
||||
// by the observability.Logger; correlated data must never appear on stdout.
|
||||
type StdoutSink struct{}
|
||||
|
||||
// NewStdoutSink creates a new stdout sink.
|
||||
func NewStdoutSink(config Config) *StdoutSink {
|
||||
level := observability.INFO
|
||||
if config.Level != "" {
|
||||
level = observability.ParseLogLevel(strings.ToUpper(config.Level))
|
||||
}
|
||||
return &StdoutSink{
|
||||
config: config,
|
||||
enc: json.NewEncoder(os.Stdout),
|
||||
minLevel: level,
|
||||
}
|
||||
return &StdoutSink{}
|
||||
}
|
||||
|
||||
// Name returns the sink name.
|
||||
@ -49,50 +30,17 @@ func (s *StdoutSink) Reopen() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Write writes a correlated log to stdout.
|
||||
func (s *StdoutSink) Write(ctx context.Context, log domain.CorrelatedLog) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
// Filter output based on correlation status and log level
|
||||
// DEBUG: all logs, INFO: only correlated, WARN: only correlated with issues, ERROR: none
|
||||
if !s.shouldEmit(log) {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := s.enc.Encode(log); err != nil {
|
||||
return fmt.Errorf("failed to write to stdout: %w", err)
|
||||
}
|
||||
|
||||
// Write is a no-op: correlated data must never be written to stdout.
|
||||
func (s *StdoutSink) Write(_ context.Context, _ domain.CorrelatedLog) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// shouldEmit determines if a log should be emitted based on its correlation status and configured level.
|
||||
func (s *StdoutSink) shouldEmit(log domain.CorrelatedLog) bool {
|
||||
switch s.minLevel {
|
||||
case observability.DEBUG:
|
||||
// Emit everything including orphans
|
||||
return true
|
||||
case observability.INFO:
|
||||
// Emit all correlated logs, skip orphans
|
||||
return log.Correlated
|
||||
case observability.WARN:
|
||||
// Emit only correlated logs (same as INFO for now, can be extended)
|
||||
return log.Correlated
|
||||
case observability.ERROR:
|
||||
// Never emit to stdout at ERROR level
|
||||
return false
|
||||
default:
|
||||
return log.Correlated
|
||||
}
|
||||
}
|
||||
|
||||
// Flush flushes any buffered data (no-op for stdout).
|
||||
func (s *StdoutSink) Flush(ctx context.Context) error {
|
||||
// Flush is a no-op for stdout.
|
||||
func (s *StdoutSink) Flush(_ context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Close closes the sink (no-op for stdout).
|
||||
// Close is a no-op for stdout.
|
||||
func (s *StdoutSink) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
81
internal/adapters/outbound/stdout/sink_test.go
Normal file
81
internal/adapters/outbound/stdout/sink_test.go
Normal file
@ -0,0 +1,81 @@
|
||||
package stdout
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/logcorrelator/logcorrelator/internal/domain"
|
||||
)
|
||||
|
||||
func makeLog(correlated bool) domain.CorrelatedLog {
|
||||
return domain.CorrelatedLog{
|
||||
Timestamp: time.Unix(1700000000, 0),
|
||||
SrcIP: "1.2.3.4",
|
||||
SrcPort: 12345,
|
||||
Correlated: correlated,
|
||||
}
|
||||
}
|
||||
|
||||
// captureStdout replaces os.Stdout temporarily and returns what was written.
|
||||
func captureStdout(t *testing.T, fn func()) string {
|
||||
t.Helper()
|
||||
r, w, err := os.Pipe()
|
||||
if err != nil {
|
||||
t.Fatalf("os.Pipe: %v", err)
|
||||
}
|
||||
old := os.Stdout
|
||||
os.Stdout = w
|
||||
|
||||
fn()
|
||||
|
||||
w.Close()
|
||||
os.Stdout = old
|
||||
|
||||
var buf bytes.Buffer
|
||||
buf.ReadFrom(r)
|
||||
r.Close()
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func TestStdoutSink_Name(t *testing.T) {
|
||||
s := NewStdoutSink(Config{Enabled: true})
|
||||
if s.Name() != "stdout" {
|
||||
t.Errorf("expected name 'stdout', got %q", s.Name())
|
||||
}
|
||||
}
|
||||
|
||||
// TestStdoutSink_WriteDoesNotProduceOutput verifies that no JSON data
|
||||
// (correlated or not) is ever written to stdout.
|
||||
func TestStdoutSink_WriteDoesNotProduceOutput(t *testing.T) {
|
||||
s := NewStdoutSink(Config{Enabled: true})
|
||||
|
||||
got := captureStdout(t, func() {
|
||||
if err := s.Write(context.Background(), makeLog(true)); err != nil {
|
||||
t.Fatalf("Write(correlated) returned error: %v", err)
|
||||
}
|
||||
if err := s.Write(context.Background(), makeLog(false)); err != nil {
|
||||
t.Fatalf("Write(orphan) returned error: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if got != "" {
|
||||
t.Errorf("stdout must be empty but got: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStdoutSink_NoopMethods(t *testing.T) {
|
||||
s := NewStdoutSink(Config{Enabled: true})
|
||||
|
||||
if err := s.Flush(context.Background()); err != nil {
|
||||
t.Errorf("Flush returned error: %v", err)
|
||||
}
|
||||
if err := s.Close(); err != nil {
|
||||
t.Errorf("Close returned error: %v", err)
|
||||
}
|
||||
if err := s.Reopen(); err != nil {
|
||||
t.Errorf("Reopen returned error: %v", err)
|
||||
}
|
||||
}
|
||||
@ -2,6 +2,7 @@ package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
@ -17,6 +18,13 @@ type Config struct {
|
||||
Inputs InputsConfig `yaml:"inputs"`
|
||||
Outputs OutputsConfig `yaml:"outputs"`
|
||||
Correlation CorrelationConfig `yaml:"correlation"`
|
||||
Metrics MetricsConfig `yaml:"metrics"`
|
||||
}
|
||||
|
||||
// MetricsConfig holds metrics server configuration.
|
||||
type MetricsConfig struct {
|
||||
Enabled bool `yaml:"enabled"`
|
||||
Addr string `yaml:"addr"` // e.g., ":8080", "localhost:8080"
|
||||
}
|
||||
|
||||
// LogConfig holds logging configuration.
|
||||
@ -85,11 +93,12 @@ type StdoutOutputConfig struct {
|
||||
|
||||
// CorrelationConfig holds correlation configuration.
|
||||
type CorrelationConfig struct {
|
||||
TimeWindow TimeWindowConfig `yaml:"time_window"`
|
||||
OrphanPolicy OrphanPolicyConfig `yaml:"orphan_policy"`
|
||||
Matching MatchingConfig `yaml:"matching"`
|
||||
Buffers BuffersConfig `yaml:"buffers"`
|
||||
TTL TTLConfig `yaml:"ttl"`
|
||||
TimeWindow TimeWindowConfig `yaml:"time_window"`
|
||||
OrphanPolicy OrphanPolicyConfig `yaml:"orphan_policy"`
|
||||
Matching MatchingConfig `yaml:"matching"`
|
||||
Buffers BuffersConfig `yaml:"buffers"`
|
||||
TTL TTLConfig `yaml:"ttl"`
|
||||
ExcludeSourceIPs []string `yaml:"exclude_source_ips"` // List of source IPs or CIDR ranges to exclude
|
||||
// Deprecated: Use TimeWindow.Value instead
|
||||
TimeWindowS int `yaml:"time_window_s"`
|
||||
// Deprecated: Use OrphanPolicy.ApacheAlwaysEmit instead
|
||||
@ -341,3 +350,48 @@ func (c *UnixSocketConfig) GetSocketPermissions() os.FileMode {
|
||||
|
||||
return os.FileMode(perms)
|
||||
}
|
||||
|
||||
// GetExcludeSourceIPs returns the list of excluded source IPs or CIDR ranges.
|
||||
func (c *CorrelationConfig) GetExcludeSourceIPs() []string {
|
||||
return c.ExcludeSourceIPs
|
||||
}
|
||||
|
||||
// IsSourceIPExcluded checks if a source IP should be excluded.
|
||||
// Supports both exact IP matches and CIDR ranges.
|
||||
func (c *CorrelationConfig) IsSourceIPExcluded(ip string) bool {
|
||||
if len(c.ExcludeSourceIPs) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// Parse the IP once
|
||||
parsedIP := net.ParseIP(ip)
|
||||
if parsedIP == nil {
|
||||
return false // Invalid IP
|
||||
}
|
||||
|
||||
for _, exclude := range c.ExcludeSourceIPs {
|
||||
// Try CIDR first
|
||||
if strings.Contains(exclude, "/") {
|
||||
_, cidr, err := net.ParseCIDR(exclude)
|
||||
if err != nil {
|
||||
continue // Invalid CIDR, skip
|
||||
}
|
||||
if cidr.Contains(parsedIP) {
|
||||
return true
|
||||
}
|
||||
} else {
|
||||
// Exact IP match
|
||||
if exclude == ip {
|
||||
return true
|
||||
}
|
||||
// Also try parsing as IP (handles different formats like 192.168.1.1 vs 192.168.001.001)
|
||||
if excludeIP := net.ParseIP(exclude); excludeIP != nil {
|
||||
if excludeIP.Equal(parsedIP) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
@ -2,6 +2,8 @@ package domain
|
||||
|
||||
import (
|
||||
"container/list"
|
||||
"net"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -41,6 +43,7 @@ type CorrelationConfig struct {
|
||||
MaxNetworkBufferSize int // Maximum events to buffer for source B (Network)
|
||||
NetworkTTLS int // TTL in seconds for network events (source B)
|
||||
MatchingMode string // "one_to_one" or "one_to_many" (Keep-Alive)
|
||||
ExcludeSourceIPs []string // List of source IPs or CIDR ranges to exclude
|
||||
}
|
||||
|
||||
// pendingOrphan represents an A event waiting to be emitted as orphan.
|
||||
@ -52,16 +55,18 @@ type pendingOrphan struct {
|
||||
|
||||
// CorrelationService handles the correlation logic between source A and B events.
|
||||
type CorrelationService struct {
|
||||
config CorrelationConfig
|
||||
mu sync.Mutex
|
||||
bufferA *eventBuffer
|
||||
bufferB *eventBuffer
|
||||
pendingA map[string][]*list.Element // key -> ordered elements containing *NormalizedEvent
|
||||
pendingB map[string][]*list.Element
|
||||
networkTTLs map[*list.Element]time.Time // TTL expiration time for each B event
|
||||
pendingOrphans map[string][]*pendingOrphan // key -> A events waiting to be emitted as orphans
|
||||
timeProvider TimeProvider
|
||||
logger *observability.Logger
|
||||
config CorrelationConfig
|
||||
mu sync.Mutex
|
||||
bufferA *eventBuffer
|
||||
bufferB *eventBuffer
|
||||
pendingA map[string][]*list.Element // key -> ordered elements containing *NormalizedEvent
|
||||
pendingB map[string][]*list.Element
|
||||
networkTTLs map[*list.Element]time.Time // TTL expiration time for each B event
|
||||
pendingOrphans map[string][]*pendingOrphan // key -> A events waiting to be emitted as orphans
|
||||
timeProvider TimeProvider
|
||||
logger *observability.Logger
|
||||
metrics *observability.CorrelationMetrics
|
||||
excludeIPsParsed []*net.IPNet // Parsed CIDR ranges for efficient lookup
|
||||
}
|
||||
|
||||
type eventBuffer struct {
|
||||
@ -111,6 +116,17 @@ func NewCorrelationService(config CorrelationConfig, timeProvider TimeProvider)
|
||||
config.ApacheEmitDelayMs = DefaultApacheEmitDelayMs
|
||||
}
|
||||
|
||||
// Parse excluded IPs (CIDR ranges) for efficient lookup
|
||||
var excludeIPsParsed []*net.IPNet
|
||||
for _, exclude := range config.ExcludeSourceIPs {
|
||||
if strings.Contains(exclude, "/") {
|
||||
_, cidr, err := net.ParseCIDR(exclude)
|
||||
if err == nil {
|
||||
excludeIPsParsed = append(excludeIPsParsed, cidr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return &CorrelationService{
|
||||
config: config,
|
||||
bufferA: newEventBuffer(),
|
||||
@ -121,6 +137,8 @@ func NewCorrelationService(config CorrelationConfig, timeProvider TimeProvider)
|
||||
networkTTLs: make(map[*list.Element]time.Time),
|
||||
timeProvider: timeProvider,
|
||||
logger: observability.NewLogger("correlation"),
|
||||
metrics: observability.NewCorrelationMetrics(),
|
||||
excludeIPsParsed: excludeIPsParsed,
|
||||
}
|
||||
}
|
||||
|
||||
@ -129,11 +147,56 @@ func (s *CorrelationService) SetLogger(logger *observability.Logger) {
|
||||
s.logger = logger
|
||||
}
|
||||
|
||||
// isIPExcluded checks if a source IP should be excluded from correlation.
|
||||
func (s *CorrelationService) isIPExcluded(ip string) bool {
|
||||
// Check exact IP matches first (from config)
|
||||
for _, exclude := range s.config.ExcludeSourceIPs {
|
||||
if !strings.Contains(exclude, "/") {
|
||||
// Exact IP match
|
||||
if exclude == ip {
|
||||
return true
|
||||
}
|
||||
if excludeIP := net.ParseIP(exclude); excludeIP != nil {
|
||||
if parsedIP := net.ParseIP(ip); parsedIP != nil {
|
||||
if excludeIP.Equal(parsedIP) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check CIDR ranges
|
||||
parsedIP := net.ParseIP(ip)
|
||||
if parsedIP == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, cidr := range s.excludeIPsParsed {
|
||||
if cidr.Contains(parsedIP) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// ProcessEvent processes an incoming event and returns correlated logs if matches are found.
|
||||
func (s *CorrelationService) ProcessEvent(event *NormalizedEvent) []CorrelatedLog {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
// Check if source IP is excluded
|
||||
if s.isIPExcluded(event.SrcIP) {
|
||||
s.logger.Debugf("event excluded by IP filter: source=%s src_ip=%s src_port=%d",
|
||||
event.Source, event.SrcIP, event.SrcPort)
|
||||
s.metrics.RecordCorrelationFailed("ip_excluded")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Record event received
|
||||
s.metrics.RecordEventReceived(string(event.Source))
|
||||
|
||||
// Clean expired events first
|
||||
s.cleanExpired()
|
||||
|
||||
@ -145,6 +208,7 @@ func (s *CorrelationService) ProcessEvent(event *NormalizedEvent) []CorrelatedLo
|
||||
// Buffer full - rotate oldest event instead of dropping new one
|
||||
s.logger.Warnf("buffer full, rotating oldest event: source=%s src_ip=%s src_port=%d",
|
||||
event.Source, event.SrcIP, event.SrcPort)
|
||||
s.metrics.RecordCorrelationFailed("buffer_eviction")
|
||||
if event.Source == SourceA {
|
||||
// Remove oldest A event and emit as orphan if configured
|
||||
s.rotateOldestA()
|
||||
@ -166,6 +230,7 @@ func (s *CorrelationService) ProcessEvent(event *NormalizedEvent) []CorrelatedLo
|
||||
case SourceB:
|
||||
results, shouldBuffer = s.processSourceB(event)
|
||||
default:
|
||||
s.logger.Warnf("unknown event source: %s", event.Source)
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -175,6 +240,9 @@ func (s *CorrelationService) ProcessEvent(event *NormalizedEvent) []CorrelatedLo
|
||||
event.Source, event.SrcIP, event.SrcPort, s.getBufferSize(event.Source))
|
||||
}
|
||||
|
||||
// Update buffer sizes in metrics
|
||||
s.metrics.UpdateBufferSizes(int64(s.bufferA.events.Len()), int64(s.bufferB.events.Len()))
|
||||
|
||||
// Combine orphan results with correlation results
|
||||
if len(orphanResults) > 0 {
|
||||
results = append(orphanResults, results...)
|
||||
@ -248,6 +316,7 @@ func (s *CorrelationService) rotateOldestB() {
|
||||
|
||||
func (s *CorrelationService) processSourceA(event *NormalizedEvent) ([]CorrelatedLog, bool) {
|
||||
key := event.CorrelationKey()
|
||||
s.logger.Debugf("processing A event: key=%s timestamp=%v", key, event.Timestamp)
|
||||
|
||||
// Look for matching B events
|
||||
matches := s.findMatches(s.bufferB, s.pendingB, key, func(other *NormalizedEvent) bool {
|
||||
@ -259,9 +328,10 @@ func (s *CorrelationService) processSourceA(event *NormalizedEvent) ([]Correlate
|
||||
// Correlate with all matching B events (one-to-many)
|
||||
for _, bEvent := range matches {
|
||||
correlated := NewCorrelatedLog(event, bEvent)
|
||||
s.logger.Debugf("correlation found: A(src_ip=%s src_port=%d) + B(src_ip=%s src_port=%d)",
|
||||
event.SrcIP, event.SrcPort, bEvent.SrcIP, bEvent.SrcPort)
|
||||
s.logger.Debugf("correlation found: A(src_ip=%s src_port=%d ts=%v) + B(src_ip=%s src_port=%d ts=%v)",
|
||||
event.SrcIP, event.SrcPort, event.Timestamp, bEvent.SrcIP, bEvent.SrcPort, bEvent.Timestamp)
|
||||
results = append(results, correlated)
|
||||
s.metrics.RecordCorrelationSuccess()
|
||||
|
||||
// Reset TTL for matched B event (Keep-Alive)
|
||||
if s.config.MatchingMode == MatchingModeOneToMany {
|
||||
@ -271,6 +341,9 @@ func (s *CorrelationService) processSourceA(event *NormalizedEvent) ([]Correlate
|
||||
for _, elem := range elements {
|
||||
if elem.Value.(*NormalizedEvent) == bEvent {
|
||||
s.resetNetworkTTL(elem)
|
||||
s.metrics.RecordKeepAliveReset()
|
||||
s.logger.Debugf("TTL reset for B event (Keep-Alive): key=%s new_ttl=%v",
|
||||
bKey, s.config.NetworkTTLS)
|
||||
break
|
||||
}
|
||||
}
|
||||
@ -286,6 +359,25 @@ func (s *CorrelationService) processSourceA(event *NormalizedEvent) ([]Correlate
|
||||
return results, false
|
||||
}
|
||||
|
||||
// No match found - check if there are B events with same key but outside time window
|
||||
if bElements, ok := s.pendingB[key]; ok && len(bElements) > 0 {
|
||||
// Key exists but no time match - log the time difference for debugging
|
||||
for _, bElem := range bElements {
|
||||
bEvent := bElem.Value.(*NormalizedEvent)
|
||||
timeDiff := event.Timestamp.Sub(bEvent.Timestamp)
|
||||
if timeDiff < 0 {
|
||||
timeDiff = -timeDiff
|
||||
}
|
||||
s.logger.Debugf("A event has same key as B but outside time window: key=%s time_diff=%v window=%v",
|
||||
key, timeDiff, s.config.TimeWindow)
|
||||
}
|
||||
s.metrics.RecordCorrelationFailed("time_window")
|
||||
} else {
|
||||
// No B events with same key at all
|
||||
s.logger.Debugf("A event has no matching B key in buffer: key=%s", key)
|
||||
s.metrics.RecordCorrelationFailed("no_match_key")
|
||||
}
|
||||
|
||||
// No match found - orphan A event
|
||||
// Instead of emitting immediately, add to pending orphans with delay
|
||||
// This allows B events to arrive slightly after A and still correlate
|
||||
@ -294,9 +386,11 @@ func (s *CorrelationService) processSourceA(event *NormalizedEvent) ([]Correlate
|
||||
if s.config.ApacheEmitDelayMs == 0 {
|
||||
orphan := NewCorrelatedLogFromEvent(event, "A")
|
||||
s.logger.Warnf("orphan A event (immediate): src_ip=%s src_port=%d", event.SrcIP, event.SrcPort)
|
||||
s.metrics.RecordOrphanEmitted("A")
|
||||
return []CorrelatedLog{orphan}, false
|
||||
}
|
||||
s.addPendingOrphan(event)
|
||||
s.metrics.RecordPendingOrphan()
|
||||
s.logger.Debugf("A event added to pending orphans (delay=%dms): src_ip=%s src_port=%d",
|
||||
s.config.ApacheEmitDelayMs, event.SrcIP, event.SrcPort)
|
||||
// Don't emit yet - will be emitted after delay expires
|
||||
@ -310,13 +404,16 @@ func (s *CorrelationService) processSourceA(event *NormalizedEvent) ([]Correlate
|
||||
|
||||
func (s *CorrelationService) processSourceB(event *NormalizedEvent) ([]CorrelatedLog, bool) {
|
||||
key := event.CorrelationKey()
|
||||
s.logger.Debugf("processing B event: key=%s timestamp=%v", key, event.Timestamp)
|
||||
|
||||
// FIRST: Check if there's a pending orphan A that matches this B event
|
||||
// This is the key optimization for delayed orphan emission
|
||||
if aEvent := s.checkPendingOrphansForCorrelation(event); aEvent != nil {
|
||||
correlated := NewCorrelatedLog(aEvent, event)
|
||||
s.logger.Debugf("correlation found (pending orphan): A(src_ip=%s src_port=%d) + B(src_ip=%s src_port=%d)",
|
||||
aEvent.SrcIP, aEvent.SrcPort, event.SrcIP, event.SrcPort)
|
||||
s.logger.Debugf("correlation found (pending orphan): A(src_ip=%s src_port=%d ts=%v) + B(src_ip=%s src_port=%d ts=%v)",
|
||||
aEvent.SrcIP, aEvent.SrcPort, aEvent.Timestamp, event.SrcIP, event.SrcPort, event.Timestamp)
|
||||
s.metrics.RecordCorrelationSuccess()
|
||||
s.metrics.RecordPendingOrphanMatch()
|
||||
return []CorrelatedLog{correlated}, false
|
||||
}
|
||||
|
||||
@ -325,12 +422,34 @@ func (s *CorrelationService) processSourceB(event *NormalizedEvent) ([]Correlate
|
||||
return s.eventsMatch(other, event)
|
||||
}); aEvent != nil {
|
||||
correlated := NewCorrelatedLog(aEvent, event)
|
||||
s.logger.Debugf("correlation found: A(src_ip=%s src_port=%d) + B(src_ip=%s src_port=%d)",
|
||||
aEvent.SrcIP, aEvent.SrcPort, event.SrcIP, event.SrcPort)
|
||||
s.logger.Debugf("correlation found: A(src_ip=%s src_port=%d ts=%v) + B(src_ip=%s src_port=%d ts=%v)",
|
||||
aEvent.SrcIP, aEvent.SrcPort, aEvent.Timestamp, event.SrcIP, event.SrcPort, event.Timestamp)
|
||||
s.metrics.RecordCorrelationSuccess()
|
||||
return []CorrelatedLog{correlated}, false
|
||||
}
|
||||
|
||||
// No match found - check if there are A events with same key but outside time window
|
||||
if aElements, ok := s.pendingA[key]; ok && len(aElements) > 0 {
|
||||
// Key exists but no time match - log the time difference for debugging
|
||||
for _, aElem := range aElements {
|
||||
aEvent := aElem.Value.(*NormalizedEvent)
|
||||
timeDiff := aEvent.Timestamp.Sub(event.Timestamp)
|
||||
if timeDiff < 0 {
|
||||
timeDiff = -timeDiff
|
||||
}
|
||||
s.logger.Debugf("B event has same key as A but outside time window: key=%s time_diff=%v window=%v",
|
||||
key, timeDiff, s.config.TimeWindow)
|
||||
}
|
||||
s.metrics.RecordCorrelationFailed("time_window")
|
||||
} else {
|
||||
// No A events with same key at all
|
||||
s.logger.Debugf("B event has no matching A key in buffer: key=%s", key)
|
||||
s.metrics.RecordCorrelationFailed("no_match_key")
|
||||
}
|
||||
|
||||
// Never emit B alone. Keep in buffer for potential future match.
|
||||
s.logger.Debugf("B event buffered (no match yet): key=%s src_ip=%s src_port=%d",
|
||||
key, event.SrcIP, event.SrcPort)
|
||||
return nil, true
|
||||
}
|
||||
|
||||
@ -417,8 +536,12 @@ func (s *CorrelationService) cleanBufferAByBTTL() {
|
||||
}
|
||||
|
||||
if s.config.ApacheAlwaysEmit {
|
||||
s.logger.Warnf("orphan A event (no B match, TTL expired): src_ip=%s src_port=%d",
|
||||
event.SrcIP, event.SrcPort)
|
||||
s.logger.Warnf("orphan A event (no B match, TTL expired): src_ip=%s src_port=%d key=%s",
|
||||
event.SrcIP, event.SrcPort, key)
|
||||
s.metrics.RecordOrphanEmitted("A")
|
||||
} else {
|
||||
s.logger.Debugf("A event removed (no valid B, TTL expired): src_ip=%s src_port=%d key=%s",
|
||||
event.SrcIP, event.SrcPort, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -448,6 +571,7 @@ func (s *CorrelationService) cleanBuffer(buffer *eventBuffer, pending map[string
|
||||
// rather than the original event timestamp.
|
||||
func (s *CorrelationService) cleanNetworkBufferByTTL() {
|
||||
now := s.timeProvider.Now()
|
||||
var removed int
|
||||
for elem, ttl := range s.networkTTLs {
|
||||
if now.After(ttl) {
|
||||
event := elem.Value.(*NormalizedEvent)
|
||||
@ -458,8 +582,15 @@ func (s *CorrelationService) cleanNetworkBufferByTTL() {
|
||||
delete(s.pendingB, key)
|
||||
}
|
||||
delete(s.networkTTLs, elem)
|
||||
removed++
|
||||
s.logger.Debugf("B event TTL expired: key=%s src_ip=%s src_port=%d ttl=%v age=%v",
|
||||
key, event.SrcIP, event.SrcPort, s.config.NetworkTTLS, now.Sub(event.Timestamp))
|
||||
s.metrics.RecordCorrelationFailed("ttl_expired")
|
||||
}
|
||||
}
|
||||
if removed > 0 {
|
||||
s.logger.Debugf("cleaned %d expired B events", removed)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CorrelationService) findAndPopFirstMatch(
|
||||
@ -635,19 +766,20 @@ func (s *CorrelationService) emitPendingOrphans() []CorrelatedLog {
|
||||
if !s.config.ApacheAlwaysEmit {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
now := s.timeProvider.Now()
|
||||
var results []CorrelatedLog
|
||||
|
||||
|
||||
for key, orphans := range s.pendingOrphans {
|
||||
for i := len(orphans) - 1; i >= 0; i-- {
|
||||
if now.After(orphans[i].emitAfter) {
|
||||
// Time to emit this orphan
|
||||
orphan := NewCorrelatedLogFromEvent(orphans[i].event, "A")
|
||||
s.logger.Warnf("orphan A event (emit delay expired): src_ip=%s src_port=%d",
|
||||
orphans[i].event.SrcIP, orphans[i].event.SrcPort)
|
||||
s.logger.Warnf("orphan A event (emit delay expired): src_ip=%s src_port=%d key=%s delay_ms=%d",
|
||||
orphans[i].event.SrcIP, orphans[i].event.SrcPort, key, s.config.ApacheEmitDelayMs)
|
||||
s.metrics.RecordOrphanEmitted("A")
|
||||
results = append(results, orphan)
|
||||
|
||||
|
||||
// Remove from pending
|
||||
s.pendingOrphans[key] = append(orphans[:i], orphans[i+1:]...)
|
||||
if len(s.pendingOrphans[key]) == 0 {
|
||||
@ -656,7 +788,7 @@ func (s *CorrelationService) emitPendingOrphans() []CorrelatedLog {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
@ -755,3 +887,15 @@ func (s *CorrelationService) GetBufferSizes() (int, int) {
|
||||
defer s.mu.Unlock()
|
||||
return s.bufferA.events.Len(), s.bufferB.events.Len()
|
||||
}
|
||||
|
||||
// GetMetrics returns the correlation metrics (for monitoring/debugging).
|
||||
func (s *CorrelationService) GetMetrics() *observability.CorrelationMetrics {
|
||||
return s.metrics
|
||||
}
|
||||
|
||||
// GetMetricsSnapshot returns a point-in-time snapshot of metrics.
|
||||
func (s *CorrelationService) GetMetricsSnapshot() observability.MetricsSnapshot {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
return s.metrics.Snapshot()
|
||||
}
|
||||
|
||||
176
internal/observability/metrics.go
Normal file
176
internal/observability/metrics.go
Normal file
@ -0,0 +1,176 @@
|
||||
package observability
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// CorrelationMetrics tracks correlation statistics for debugging and
// monitoring. Every counter is a typed atomic, so all Record* methods are
// safe for concurrent use without additional locking.
type CorrelationMetrics struct {
	mu sync.RWMutex // NOTE(review): not referenced by any method in this file — confirm whether it can be removed (the sync import would go with it)

	// Events received per source.
	eventsReceivedA atomic.Int64
	eventsReceivedB atomic.Int64

	// Correlation results.
	correlationsSuccess atomic.Int64
	correlationsFailed  atomic.Int64

	// Failure reasons (each also bumps correlationsFailed).
	failedNoMatchKey     atomic.Int64 // no event with same key in buffer
	failedTimeWindow     atomic.Int64 // key found but outside time window
	failedBufferEviction atomic.Int64 // event evicted due to buffer full
	failedTTLExpired     atomic.Int64 // B event TTL expired before match
	failedIPExcluded     atomic.Int64 // event excluded by IP filter

	// Buffer gauges, set via UpdateBufferSizes.
	bufferASize atomic.Int64
	bufferBSize atomic.Int64

	// Orphan stats.
	orphansEmittedA    atomic.Int64
	orphansEmittedB    atomic.Int64
	orphansPendingA    atomic.Int64
	pendingOrphanMatch atomic.Int64 // B matched with a pending orphan A

	// Keep-Alive stats.
	keepAliveResets atomic.Int64 // TTL resets (one-to-many mode)
}

// NewCorrelationMetrics creates a zeroed metrics tracker ready for use.
func NewCorrelationMetrics() *CorrelationMetrics {
	return new(CorrelationMetrics)
}

// RecordEventReceived counts one event received from source "A" or "B";
// any other source value is ignored.
func (m *CorrelationMetrics) RecordEventReceived(source string) {
	switch source {
	case "A":
		m.eventsReceivedA.Add(1)
	case "B":
		m.eventsReceivedB.Add(1)
	}
}

// RecordCorrelationSuccess counts one successful correlation.
func (m *CorrelationMetrics) RecordCorrelationSuccess() {
	m.correlationsSuccess.Add(1)
}

// RecordCorrelationFailed counts one failed correlation attempt and, when the
// reason string is recognized, the matching per-reason counter. An unknown
// reason bumps only the aggregate failure counter.
func (m *CorrelationMetrics) RecordCorrelationFailed(reason string) {
	m.correlationsFailed.Add(1)

	var counter *atomic.Int64
	switch reason {
	case "no_match_key":
		counter = &m.failedNoMatchKey
	case "time_window":
		counter = &m.failedTimeWindow
	case "buffer_eviction":
		counter = &m.failedBufferEviction
	case "ttl_expired":
		counter = &m.failedTTLExpired
	case "ip_excluded":
		counter = &m.failedIPExcluded
	default:
		return
	}
	counter.Add(1)
}

// RecordBufferEviction records an event evicted from a buffer.
// Currently a deliberate no-op placeholder; eviction-driven failures are
// counted through RecordCorrelationFailed("buffer_eviction") instead.
func (m *CorrelationMetrics) RecordBufferEviction(source string) {
	// Intentionally empty: reserved for future per-source eviction tracking.
}

// RecordOrphanEmitted counts an orphan event emitted for source "A" or "B";
// any other source value is ignored.
func (m *CorrelationMetrics) RecordOrphanEmitted(source string) {
	switch source {
	case "A":
		m.orphansEmittedA.Add(1)
	case "B":
		m.orphansEmittedB.Add(1)
	}
}

// RecordPendingOrphan counts an A event parked in the pending-orphan set.
func (m *CorrelationMetrics) RecordPendingOrphan() {
	m.orphansPendingA.Add(1)
}

// RecordPendingOrphanMatch counts a B event that matched a pending orphan A.
func (m *CorrelationMetrics) RecordPendingOrphanMatch() {
	m.pendingOrphanMatch.Add(1)
}

// RecordKeepAliveReset counts a TTL reset performed for Keep-Alive handling.
func (m *CorrelationMetrics) RecordKeepAliveReset() {
	m.keepAliveResets.Add(1)
}

// UpdateBufferSizes stores the current sizes of buffers A and B.
func (m *CorrelationMetrics) UpdateBufferSizes(sizeA, sizeB int64) {
	m.bufferASize.Store(sizeA)
	m.bufferBSize.Store(sizeB)
}

// Snapshot returns a point-in-time copy of all counters. Each counter is
// loaded atomically, but the snapshot as a whole is not taken under a lock,
// so concurrently updated counters may reflect slightly different instants.
func (m *CorrelationMetrics) Snapshot() MetricsSnapshot {
	var snap MetricsSnapshot
	snap.EventsReceivedA = m.eventsReceivedA.Load()
	snap.EventsReceivedB = m.eventsReceivedB.Load()
	snap.CorrelationsSuccess = m.correlationsSuccess.Load()
	snap.CorrelationsFailed = m.correlationsFailed.Load()
	snap.FailedNoMatchKey = m.failedNoMatchKey.Load()
	snap.FailedTimeWindow = m.failedTimeWindow.Load()
	snap.FailedBufferEviction = m.failedBufferEviction.Load()
	snap.FailedTTLExpired = m.failedTTLExpired.Load()
	snap.FailedIPExcluded = m.failedIPExcluded.Load()
	snap.BufferASize = m.bufferASize.Load()
	snap.BufferBSize = m.bufferBSize.Load()
	snap.OrphansEmittedA = m.orphansEmittedA.Load()
	snap.OrphansEmittedB = m.orphansEmittedB.Load()
	snap.OrphansPendingA = m.orphansPendingA.Load()
	snap.PendingOrphanMatch = m.pendingOrphanMatch.Load()
	snap.KeepAliveResets = m.keepAliveResets.Load()
	return snap
}

// MetricsSnapshot is a point-in-time, JSON-serializable view of
// CorrelationMetrics.
type MetricsSnapshot struct {
	EventsReceivedA      int64 `json:"events_received_a"`
	EventsReceivedB      int64 `json:"events_received_b"`
	CorrelationsSuccess  int64 `json:"correlations_success"`
	CorrelationsFailed   int64 `json:"correlations_failed"`
	FailedNoMatchKey     int64 `json:"failed_no_match_key"`
	FailedTimeWindow     int64 `json:"failed_time_window"`
	FailedBufferEviction int64 `json:"failed_buffer_eviction"`
	FailedTTLExpired     int64 `json:"failed_ttl_expired"`
	FailedIPExcluded     int64 `json:"failed_ip_excluded"`
	BufferASize          int64 `json:"buffer_a_size"`
	BufferBSize          int64 `json:"buffer_b_size"`
	OrphansEmittedA      int64 `json:"orphans_emitted_a"`
	OrphansEmittedB      int64 `json:"orphans_emitted_b"`
	OrphansPendingA      int64 `json:"orphans_pending_a"`
	PendingOrphanMatch   int64 `json:"pending_orphan_match"`
	KeepAliveResets      int64 `json:"keepalive_resets"`
}

// MarshalJSON implements json.Marshaler by encoding a fresh snapshot.
func (m *CorrelationMetrics) MarshalJSON() ([]byte, error) {
	return json.Marshal(m.Snapshot())
}

// String returns a human-readable multi-line summary of the metrics.
func (m *CorrelationMetrics) String() string {
	snap := m.Snapshot()
	var sb strings.Builder
	sb.WriteString("Correlation Metrics:\n")
	sb.WriteString(fmt.Sprintf(" Events Received: A=%d B=%d Total=%d\n", snap.EventsReceivedA, snap.EventsReceivedB, snap.EventsReceivedA+snap.EventsReceivedB))
	sb.WriteString(fmt.Sprintf(" Correlations: Success=%d Failed=%d\n", snap.CorrelationsSuccess, snap.CorrelationsFailed))
	sb.WriteString(fmt.Sprintf(" Failure Reasons: no_match_key=%d time_window=%d buffer_eviction=%d ttl_expired=%d ip_excluded=%d\n",
		snap.FailedNoMatchKey, snap.FailedTimeWindow, snap.FailedBufferEviction, snap.FailedTTLExpired, snap.FailedIPExcluded))
	sb.WriteString(fmt.Sprintf(" Buffer Sizes: A=%d B=%d\n", snap.BufferASize, snap.BufferBSize))
	sb.WriteString(fmt.Sprintf(" Orphans: Emitted A=%d B=%d Pending A=%d\n", snap.OrphansEmittedA, snap.OrphansEmittedB, snap.OrphansPendingA))
	sb.WriteString(fmt.Sprintf(" Pending Orphan Match: %d\n", snap.PendingOrphanMatch))
	sb.WriteString(fmt.Sprintf(" Keep-Alive Resets: %d\n", snap.KeepAliveResets))
	return sb.String()
}
|
||||
128
internal/observability/metrics_server.go
Normal file
128
internal/observability/metrics_server.go
Normal file
@ -0,0 +1,128 @@
|
||||
package observability
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MetricsServer exposes correlation metrics via HTTP.
|
||||
type MetricsServer struct {
|
||||
mu sync.Mutex
|
||||
server *http.Server
|
||||
listener net.Listener
|
||||
metricsFunc func() MetricsSnapshot
|
||||
running bool
|
||||
}
|
||||
|
||||
// NewMetricsServer creates a new metrics HTTP server.
|
||||
func NewMetricsServer(addr string, metricsFunc func() MetricsSnapshot) (*MetricsServer, error) {
|
||||
if metricsFunc == nil {
|
||||
return nil, fmt.Errorf("metricsFunc cannot be nil")
|
||||
}
|
||||
|
||||
ms := &MetricsServer{
|
||||
metricsFunc: metricsFunc,
|
||||
}
|
||||
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/metrics", ms.handleMetrics)
|
||||
mux.HandleFunc("/health", ms.handleHealth)
|
||||
|
||||
ms.server = &http.Server{
|
||||
Addr: addr,
|
||||
Handler: mux,
|
||||
ReadTimeout: 5 * time.Second,
|
||||
WriteTimeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
return ms, nil
|
||||
}
|
||||
|
||||
// Start begins listening on the configured address.
|
||||
func (ms *MetricsServer) Start() error {
|
||||
ms.mu.Lock()
|
||||
defer ms.mu.Unlock()
|
||||
|
||||
if ms.running {
|
||||
return nil
|
||||
}
|
||||
|
||||
listener, err := net.Listen("tcp", ms.server.Addr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start metrics server: %w", err)
|
||||
}
|
||||
|
||||
ms.listener = listener
|
||||
ms.running = true
|
||||
|
||||
go func() {
|
||||
if err := ms.server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
||||
// Server error or closed
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop gracefully stops the metrics server.
|
||||
func (ms *MetricsServer) Stop(ctx context.Context) error {
|
||||
ms.mu.Lock()
|
||||
defer ms.mu.Unlock()
|
||||
|
||||
if !ms.running {
|
||||
return nil
|
||||
}
|
||||
|
||||
ms.running = false
|
||||
return ms.server.Shutdown(ctx)
|
||||
}
|
||||
|
||||
// handleMetrics returns the correlation metrics as JSON.
|
||||
func (ms *MetricsServer) handleMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
|
||||
metrics := ms.metricsFunc()
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
if err := json.NewEncoder(w).Encode(metrics); err != nil {
|
||||
http.Error(w, "Failed to encode metrics", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// handleHealth returns a simple health check response.
|
||||
func (ms *MetricsServer) handleHealth(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
fmt.Fprintf(w, `{"status":"healthy"}`)
|
||||
}
|
||||
|
||||
// IsRunning returns true if the server is running.
|
||||
func (ms *MetricsServer) IsRunning() bool {
|
||||
ms.mu.Lock()
|
||||
defer ms.mu.Unlock()
|
||||
return ms.running
|
||||
}
|
||||
|
||||
// Addr returns the listening address.
|
||||
func (ms *MetricsServer) Addr() string {
|
||||
ms.mu.Lock()
|
||||
defer ms.mu.Unlock()
|
||||
if ms.listener == nil {
|
||||
return ""
|
||||
}
|
||||
return ms.listener.Addr().String()
|
||||
}
|
||||
Reference in New Issue
Block a user