feat: observability, IP filtering, stdout/clickhouse fixes (v1.1.11)
- feat(observability): metrics server with /metrics and /health endpoints - feat(observability): correlation metrics (events, success/failed, reasons, buffers) - feat(correlation): IP exclusion filter (exact IPs and CIDR ranges) - feat(correlation): pending orphan delay for late-arriving B events - fix(stdout): sink is now a no-op for data; JSON must never appear on stdout - fix(clickhouse): all flush errors were silently discarded, now properly logged - fix(clickhouse): buffer overflow with DropOnOverflow now logged at WARN - fix(clickhouse): retry attempts logged at WARN with attempt/delay/error context - feat(clickhouse): connection success logged at INFO, batch sends at DEBUG - feat(clickhouse): SetLogger() for external logger injection - test(stdout): assert stdout remains empty for correlated and orphan logs - chore(rpm): bump version to 1.1.11, update changelog - docs: README and architecture.yml updated Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -12,6 +12,7 @@ import (
|
||||
|
||||
"github.com/ClickHouse/clickhouse-go/v2"
|
||||
"github.com/logcorrelator/logcorrelator/internal/domain"
|
||||
"github.com/logcorrelator/logcorrelator/internal/observability"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -53,6 +54,12 @@ type ClickHouseSink struct {
|
||||
done chan struct{}
|
||||
wg sync.WaitGroup
|
||||
closeOnce sync.Once
|
||||
logger *observability.Logger
|
||||
}
|
||||
|
||||
// SetLogger sets the logger used by the sink.
|
||||
func (s *ClickHouseSink) SetLogger(logger *observability.Logger) {
|
||||
s.logger = logger.WithFields(map[string]any{"sink": "clickhouse"})
|
||||
}
|
||||
|
||||
// NewClickHouseSink creates a new ClickHouse sink.
|
||||
@ -83,6 +90,7 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
|
||||
buffer: make([]domain.CorrelatedLog, 0, config.BatchSize),
|
||||
flushChan: make(chan struct{}, 1),
|
||||
done: make(chan struct{}),
|
||||
logger: observability.NewLogger("clickhouse"),
|
||||
}
|
||||
|
||||
// Parse DSN and create options
|
||||
@ -107,6 +115,8 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
|
||||
}
|
||||
|
||||
s.conn = conn
|
||||
s.log().Infof("connected to ClickHouse: table=%s batch_size=%d flush_interval_ms=%d",
|
||||
config.Table, config.BatchSize, config.FlushIntervalMs)
|
||||
|
||||
// Start flush goroutine
|
||||
s.wg.Add(1)
|
||||
@ -120,6 +130,14 @@ func (s *ClickHouseSink) Name() string {
|
||||
return "clickhouse"
|
||||
}
|
||||
|
||||
// log returns the logger, initializing a default one if not set (e.g. in tests).
|
||||
func (s *ClickHouseSink) log() *observability.Logger {
|
||||
if s.logger == nil {
|
||||
s.logger = observability.NewLogger("clickhouse")
|
||||
}
|
||||
return s.logger
|
||||
}
|
||||
|
||||
// Reopen is a no-op for ClickHouse (connection is managed internally).
|
||||
func (s *ClickHouseSink) Reopen() error {
|
||||
return nil
|
||||
@ -146,6 +164,7 @@ func (s *ClickHouseSink) Write(ctx context.Context, log domain.CorrelatedLog) er
|
||||
s.mu.Unlock()
|
||||
|
||||
if drop {
|
||||
s.log().Warnf("buffer full, dropping log: table=%s buffer_size=%d", s.config.Table, s.config.MaxBufferSize)
|
||||
return nil
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
@ -201,7 +220,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
select {
|
||||
case <-s.done:
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("final flush on close failed", err)
|
||||
}
|
||||
cancel()
|
||||
return
|
||||
|
||||
@ -212,7 +233,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
|
||||
if needsFlush {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("periodic flush failed", err)
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
|
||||
@ -223,7 +246,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
|
||||
if needsFlush {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("batch flush failed", err)
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
@ -247,7 +272,6 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
return fmt.Errorf("clickhouse connection is not initialized")
|
||||
}
|
||||
|
||||
// Log batch info before sending
|
||||
batchSize := len(buffer)
|
||||
|
||||
// Retry logic with exponential backoff
|
||||
@ -255,6 +279,8 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
for attempt := 0; attempt < MaxRetries; attempt++ {
|
||||
if attempt > 0 {
|
||||
delay := RetryBaseDelay * time.Duration(1<<uint(attempt-1))
|
||||
s.log().Warnf("retrying batch insert: attempt=%d/%d delay=%s rows=%d err=%v",
|
||||
attempt+1, MaxRetries, delay, batchSize, lastErr)
|
||||
select {
|
||||
case <-time.After(delay):
|
||||
case <-ctx.Done():
|
||||
@ -264,6 +290,7 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
|
||||
lastErr = s.executeBatch(ctx, buffer)
|
||||
if lastErr == nil {
|
||||
s.log().Debugf("batch sent: rows=%d table=%s", batchSize, s.config.Table)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user