feat: observability, IP filtering, stdout/clickhouse fixes (v1.1.11)

- feat(observability): metrics server with /metrics and /health endpoints
- feat(observability): correlation metrics (events, success/failed, reasons, buffers)
- feat(correlation): IP exclusion filter (exact IPs and CIDR ranges)
- feat(correlation): pending orphan delay for late-arriving B events
- fix(stdout): sink is now a no-op for data; JSON must never appear on stdout
- fix(clickhouse): flush errors were silently discarded; they are now logged
- fix(clickhouse): buffer overflow with DropOnOverflow now logged at WARN
- fix(clickhouse): retry attempts logged at WARN with attempt/delay/error context
- feat(clickhouse): connection success logged at INFO, batch sends at DEBUG
- feat(clickhouse): SetLogger() for external logger injection
- test(stdout): assert stdout remains empty for correlated and orphan logs
- chore(rpm): bump version to 1.1.11, update changelog
- docs: README and architecture.yml updated

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-03-05 11:40:54 +01:00
parent 68f0fcf810
commit e9dcd8ea51
16 changed files with 2035 additions and 116 deletions

View File

@ -1,11 +1,13 @@
package main
import (
"context"
"flag"
"fmt"
"os"
"os/signal"
"syscall"
"time"
"github.com/logcorrelator/logcorrelator/internal/adapters/inbound/unixsocket"
"github.com/logcorrelator/logcorrelator/internal/adapters/outbound/clickhouse"
@ -88,17 +90,15 @@ func main() {
logger.Error("Failed to create ClickHouse sink", err)
os.Exit(1)
}
clickHouseSink.SetLogger(logger)
sinks = append(sinks, clickHouseSink)
logger.Info(fmt.Sprintf("Configured ClickHouse sink: table=%s", cfg.Outputs.ClickHouse.Table))
}
if cfg.Outputs.Stdout.Enabled {
stdoutSink := stdout.NewStdoutSink(stdout.Config{
Enabled: true,
Level: cfg.Outputs.Stdout.Level,
})
stdoutSink := stdout.NewStdoutSink(stdout.Config{Enabled: true})
sinks = append(sinks, stdoutSink)
logger.Info(fmt.Sprintf("Configured stdout sink: level=%s", cfg.Outputs.Stdout.Level))
logger.Info("Configured stdout sink (operational logs on stderr)")
}
// Create multi-sink wrapper
@ -106,14 +106,15 @@ func main() {
// Create correlation service
correlationSvc := domain.NewCorrelationService(domain.CorrelationConfig{
TimeWindow: cfg.Correlation.GetTimeWindow(),
ApacheAlwaysEmit: cfg.Correlation.GetApacheAlwaysEmit(),
ApacheEmitDelayMs: cfg.Correlation.GetApacheEmitDelayMs(),
NetworkEmit: false,
MaxHTTPBufferSize: cfg.Correlation.GetMaxHTTPBufferSize(),
TimeWindow: cfg.Correlation.GetTimeWindow(),
ApacheAlwaysEmit: cfg.Correlation.GetApacheAlwaysEmit(),
ApacheEmitDelayMs: cfg.Correlation.GetApacheEmitDelayMs(),
NetworkEmit: false,
MaxHTTPBufferSize: cfg.Correlation.GetMaxHTTPBufferSize(),
MaxNetworkBufferSize: cfg.Correlation.GetMaxNetworkBufferSize(),
NetworkTTLS: cfg.Correlation.GetNetworkTTLS(),
MatchingMode: cfg.Correlation.GetMatchingMode(),
NetworkTTLS: cfg.Correlation.GetNetworkTTLS(),
MatchingMode: cfg.Correlation.GetMatchingMode(),
ExcludeSourceIPs: cfg.Correlation.GetExcludeSourceIPs(),
}, &domain.RealTimeProvider{})
// Set logger for correlation service
@ -124,6 +125,27 @@ func main() {
cfg.Correlation.GetApacheAlwaysEmit(),
cfg.Correlation.GetApacheEmitDelayMs()))
// Start metrics server if enabled
var metricsServer *observability.MetricsServer
if cfg.Metrics.Enabled {
addr := cfg.Metrics.Addr
if addr == "" {
addr = ":8080" // Default address
}
var err error
metricsServer, err = observability.NewMetricsServer(addr, correlationSvc.GetMetricsSnapshot)
if err != nil {
logger.Error("Failed to create metrics server", err)
os.Exit(1)
}
if err := metricsServer.Start(); err != nil {
logger.Error("Failed to start metrics server", err)
os.Exit(1)
}
logger.Info(fmt.Sprintf("Metrics server started: addr=%s", metricsServer.Addr()))
logger.Info("Metrics endpoints: /metrics (JSON), /health")
}
// Create orchestrator
orchestrator := app.NewOrchestrator(app.OrchestratorConfig{
Sources: sources,
@ -166,5 +188,14 @@ func main() {
logger.Error("Error during shutdown", err)
}
// Stop metrics server
if metricsServer != nil {
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := metricsServer.Stop(shutdownCtx); err != nil {
logger.Error("Error stopping metrics server", err)
}
}
logger.Info("logcorrelator stopped")
}