feat: observability, IP filtering, stdout/clickhouse fixes (v1.1.11)

- feat(observability): metrics server with /metrics and /health endpoints
- feat(observability): correlation metrics (events, success/failed, reasons, buffers)
- feat(correlation): IP exclusion filter (exact IPs and CIDR ranges)
- feat(correlation): pending orphan delay for late-arriving B events
- fix(stdout): sink is now a no-op for data; JSON must never appear on stdout
- fix(clickhouse): all flush errors were silently discarded, now properly logged
- fix(clickhouse): buffer overflow with DropOnOverflow now logged at WARN
- fix(clickhouse): retry attempts logged at WARN with attempt/delay/error context
- feat(clickhouse): connection success logged at INFO, batch sends at DEBUG
- feat(clickhouse): SetLogger() for external logger injection
- test(stdout): assert stdout remains empty for correlated and orphan logs
- chore(rpm): bump version to 1.1.11, update changelog
- docs: README and architecture.yml updated

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-03-05 11:40:54 +01:00
parent 68f0fcf810
commit e9dcd8ea51
16 changed files with 2035 additions and 116 deletions

View File

@ -12,6 +12,7 @@ import (
"github.com/ClickHouse/clickhouse-go/v2"
"github.com/logcorrelator/logcorrelator/internal/domain"
"github.com/logcorrelator/logcorrelator/internal/observability"
)
const (
@ -53,6 +54,12 @@ type ClickHouseSink struct {
done chan struct{}
wg sync.WaitGroup
closeOnce sync.Once
logger *observability.Logger
}
// SetLogger sets the logger used by the sink.
func (s *ClickHouseSink) SetLogger(logger *observability.Logger) {
s.logger = logger.WithFields(map[string]any{"sink": "clickhouse"})
}
// NewClickHouseSink creates a new ClickHouse sink.
@ -83,6 +90,7 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
buffer: make([]domain.CorrelatedLog, 0, config.BatchSize),
flushChan: make(chan struct{}, 1),
done: make(chan struct{}),
logger: observability.NewLogger("clickhouse"),
}
// Parse DSN and create options
@ -107,6 +115,8 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
}
s.conn = conn
s.log().Infof("connected to ClickHouse: table=%s batch_size=%d flush_interval_ms=%d",
config.Table, config.BatchSize, config.FlushIntervalMs)
// Start flush goroutine
s.wg.Add(1)
@ -120,6 +130,14 @@ func (s *ClickHouseSink) Name() string {
return "clickhouse"
}
// log returns the logger, initializing a default one if not set (e.g. in tests).
func (s *ClickHouseSink) log() *observability.Logger {
if s.logger == nil {
s.logger = observability.NewLogger("clickhouse")
}
return s.logger
}
// Reopen is a no-op for ClickHouse (connection is managed internally).
func (s *ClickHouseSink) Reopen() error {
return nil
@ -146,6 +164,7 @@ func (s *ClickHouseSink) Write(ctx context.Context, log domain.CorrelatedLog) er
s.mu.Unlock()
if drop {
s.log().Warnf("buffer full, dropping log: table=%s buffer_size=%d", s.config.Table, s.config.MaxBufferSize)
return nil
}
if time.Now().After(deadline) {
@ -201,7 +220,9 @@ func (s *ClickHouseSink) flushLoop() {
select {
case <-s.done:
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
_ = s.doFlush(ctx)
if err := s.doFlush(ctx); err != nil {
s.log().Error("final flush on close failed", err)
}
cancel()
return
@ -212,7 +233,9 @@ func (s *ClickHouseSink) flushLoop() {
if needsFlush {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
_ = s.doFlush(ctx)
if err := s.doFlush(ctx); err != nil {
s.log().Error("periodic flush failed", err)
}
cancel()
}
@ -223,7 +246,9 @@ func (s *ClickHouseSink) flushLoop() {
if needsFlush {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
_ = s.doFlush(ctx)
if err := s.doFlush(ctx); err != nil {
s.log().Error("batch flush failed", err)
}
cancel()
}
}
@ -247,7 +272,6 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
return fmt.Errorf("clickhouse connection is not initialized")
}
// Log batch info before sending
batchSize := len(buffer)
// Retry logic with exponential backoff
@ -255,6 +279,8 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
for attempt := 0; attempt < MaxRetries; attempt++ {
if attempt > 0 {
delay := RetryBaseDelay * time.Duration(1<<uint(attempt-1))
s.log().Warnf("retrying batch insert: attempt=%d/%d delay=%s rows=%d err=%v",
attempt+1, MaxRetries, delay, batchSize, lastErr)
select {
case <-time.After(delay):
case <-ctx.Done():
@ -264,6 +290,7 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
lastErr = s.executeBatch(ctx, buffer)
if lastErr == nil {
s.log().Debugf("batch sent: rows=%d table=%s", batchSize, s.config.Table)
return nil
}