feat: observability, IP filtering, stdout/clickhouse fixes (v1.1.11)

- feat(observability): metrics server with /metrics and /health endpoints
- feat(observability): correlation metrics (events, success/failed, reasons, buffers)
- feat(correlation): IP exclusion filter (exact IPs and CIDR ranges)
- feat(correlation): pending orphan delay for late-arriving B events
- fix(stdout): sink is now a no-op for data; JSON must never appear on stdout
- fix(clickhouse): all flush errors were silently discarded, now properly logged
- fix(clickhouse): buffer overflow with DropOnOverflow now logged at WARN
- fix(clickhouse): retry attempts logged at WARN with attempt/delay/error context
- feat(clickhouse): connection success logged at INFO, batch sends at DEBUG
- feat(clickhouse): SetLogger() for external logger injection
- test(stdout): assert stdout remains empty for correlated and orphan logs
- chore(rpm): bump version to 1.1.11, update changelog
- docs: README and architecture.yml updated

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-03-05 11:40:54 +01:00
parent 68f0fcf810
commit e9dcd8ea51
16 changed files with 2035 additions and 116 deletions

View File

@ -164,9 +164,9 @@ func (s *UnixSocketSource) readDatagrams(ctx context.Context, eventChan chan<- *
continue
}
// Debug: log raw events
s.logger.Debugf("event received: source=%s src_ip=%s src_port=%d",
event.Source, event.SrcIP, event.SrcPort)
// Debug: log raw events with all key details
s.logger.Debugf("event received: source=%s src_ip=%s src_port=%d timestamp=%v raw_timestamp=%v",
event.Source, event.SrcIP, event.SrcPort, event.Timestamp, event.Raw["timestamp"])
select {
case eventChan <- event:

View File

@ -12,6 +12,7 @@ import (
"github.com/ClickHouse/clickhouse-go/v2"
"github.com/logcorrelator/logcorrelator/internal/domain"
"github.com/logcorrelator/logcorrelator/internal/observability"
)
const (
@ -53,6 +54,12 @@ type ClickHouseSink struct {
done chan struct{}
wg sync.WaitGroup
closeOnce sync.Once
logger *observability.Logger
}
// SetLogger sets the logger used by the sink. The provided logger is wrapped
// with a {"sink": "clickhouse"} field so every log line emitted by this sink
// is attributable to it.
// NOTE(review): the assignment is not synchronized — callers should inject
// the logger before the sink starts writing/flushing; confirm call sites.
func (s *ClickHouseSink) SetLogger(logger *observability.Logger) {
	s.logger = logger.WithFields(map[string]any{"sink": "clickhouse"})
}
// NewClickHouseSink creates a new ClickHouse sink.
@ -83,6 +90,7 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
buffer: make([]domain.CorrelatedLog, 0, config.BatchSize),
flushChan: make(chan struct{}, 1),
done: make(chan struct{}),
logger: observability.NewLogger("clickhouse"),
}
// Parse DSN and create options
@ -107,6 +115,8 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
}
s.conn = conn
s.log().Infof("connected to ClickHouse: table=%s batch_size=%d flush_interval_ms=%d",
config.Table, config.BatchSize, config.FlushIntervalMs)
// Start flush goroutine
s.wg.Add(1)
@ -120,6 +130,14 @@ func (s *ClickHouseSink) Name() string {
return "clickhouse"
}
// log returns the logger, initializing a default one if not set (e.g. in tests
// that construct ClickHouseSink directly instead of via NewClickHouseSink,
// which populates the field).
// NOTE(review): the lazy assignment below is unsynchronized; if log() can be
// reached from multiple goroutines before the logger is set, this is a data
// race — confirm that all such paths set the logger first.
func (s *ClickHouseSink) log() *observability.Logger {
	if s.logger == nil {
		s.logger = observability.NewLogger("clickhouse")
	}
	return s.logger
}
// Reopen is a no-op for ClickHouse (connection is managed internally).
func (s *ClickHouseSink) Reopen() error {
return nil
@ -146,6 +164,7 @@ func (s *ClickHouseSink) Write(ctx context.Context, log domain.CorrelatedLog) er
s.mu.Unlock()
if drop {
s.log().Warnf("buffer full, dropping log: table=%s buffer_size=%d", s.config.Table, s.config.MaxBufferSize)
return nil
}
if time.Now().After(deadline) {
@ -201,7 +220,9 @@ func (s *ClickHouseSink) flushLoop() {
select {
case <-s.done:
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
_ = s.doFlush(ctx)
if err := s.doFlush(ctx); err != nil {
s.log().Error("final flush on close failed", err)
}
cancel()
return
@ -212,7 +233,9 @@ func (s *ClickHouseSink) flushLoop() {
if needsFlush {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
_ = s.doFlush(ctx)
if err := s.doFlush(ctx); err != nil {
s.log().Error("periodic flush failed", err)
}
cancel()
}
@ -223,7 +246,9 @@ func (s *ClickHouseSink) flushLoop() {
if needsFlush {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
_ = s.doFlush(ctx)
if err := s.doFlush(ctx); err != nil {
s.log().Error("batch flush failed", err)
}
cancel()
}
}
@ -247,7 +272,6 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
return fmt.Errorf("clickhouse connection is not initialized")
}
// Log batch info before sending
batchSize := len(buffer)
// Retry logic with exponential backoff
@ -255,6 +279,8 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
for attempt := 0; attempt < MaxRetries; attempt++ {
if attempt > 0 {
delay := RetryBaseDelay * time.Duration(1<<uint(attempt-1))
s.log().Warnf("retrying batch insert: attempt=%d/%d delay=%s rows=%d err=%v",
attempt+1, MaxRetries, delay, batchSize, lastErr)
select {
case <-time.After(delay):
case <-ctx.Done():
@ -264,6 +290,7 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
lastErr = s.executeBatch(ctx, buffer)
if lastErr == nil {
s.log().Debugf("batch sent: rows=%d table=%s", batchSize, s.config.Table)
return nil
}

View File

@ -2,41 +2,22 @@ package stdout
import (
"context"
"encoding/json"
"fmt"
"os"
"strings"
"sync"
"github.com/logcorrelator/logcorrelator/internal/domain"
"github.com/logcorrelator/logcorrelator/internal/observability"
)
// Config holds the stdout sink configuration.
type Config struct {
Enabled bool
Level string // DEBUG, INFO, WARN, ERROR - filters output verbosity
}
// StdoutSink writes correlated logs to stdout as JSON lines.
type StdoutSink struct {
config Config
mu sync.Mutex
enc *json.Encoder
minLevel observability.LogLevel
}
// StdoutSink is a no-op data sink. Operational logs are written to stderr
// by the observability.Logger; correlated data must never appear on stdout.
type StdoutSink struct{}
// NewStdoutSink creates a new stdout sink.
func NewStdoutSink(config Config) *StdoutSink {
level := observability.INFO
if config.Level != "" {
level = observability.ParseLogLevel(strings.ToUpper(config.Level))
}
return &StdoutSink{
config: config,
enc: json.NewEncoder(os.Stdout),
minLevel: level,
}
return &StdoutSink{}
}
// Name returns the sink name.
@ -49,50 +30,17 @@ func (s *StdoutSink) Reopen() error {
return nil
}
// Write writes a correlated log to stdout.
func (s *StdoutSink) Write(ctx context.Context, log domain.CorrelatedLog) error {
s.mu.Lock()
defer s.mu.Unlock()
// Filter output based on correlation status and log level
// DEBUG: all logs, INFO: only correlated, WARN: only correlated with issues, ERROR: none
if !s.shouldEmit(log) {
return nil
}
if err := s.enc.Encode(log); err != nil {
return fmt.Errorf("failed to write to stdout: %w", err)
}
// Write is a no-op: correlated data must never be written to stdout.
func (s *StdoutSink) Write(_ context.Context, _ domain.CorrelatedLog) error {
return nil
}
// shouldEmit determines if a log should be emitted based on its correlation status and configured level.
func (s *StdoutSink) shouldEmit(log domain.CorrelatedLog) bool {
switch s.minLevel {
case observability.DEBUG:
// Emit everything including orphans
return true
case observability.INFO:
// Emit all correlated logs, skip orphans
return log.Correlated
case observability.WARN:
// Emit only correlated logs (same as INFO for now, can be extended)
return log.Correlated
case observability.ERROR:
// Never emit to stdout at ERROR level
return false
default:
return log.Correlated
}
}
// Flush flushes any buffered data (no-op for stdout).
func (s *StdoutSink) Flush(ctx context.Context) error {
// Flush is a no-op for stdout.
func (s *StdoutSink) Flush(_ context.Context) error {
return nil
}
// Close closes the sink (no-op for stdout).
// Close is a no-op for stdout.
func (s *StdoutSink) Close() error {
return nil
}

View File

@ -0,0 +1,81 @@
package stdout
import (
"bytes"
"context"
"os"
"testing"
"time"
"github.com/logcorrelator/logcorrelator/internal/domain"
)
// makeLog returns a minimal CorrelatedLog fixture with a fixed timestamp and
// source address; correlated controls the correlation flag under test.
func makeLog(correlated bool) domain.CorrelatedLog {
	fixture := domain.CorrelatedLog{
		Timestamp: time.Unix(1700000000, 0),
		SrcIP:     "1.2.3.4",
		SrcPort:   12345,
	}
	fixture.Correlated = correlated
	return fixture
}
// captureStdout redirects os.Stdout to a pipe for the duration of fn and
// returns everything fn wrote to it.
//
// os.Stdout is restored via defer so it is recovered even if fn panics or
// aborts the test (t.Fatal triggers runtime.Goexit, which still runs defers);
// the original version restored it inline and would leave stdout redirected
// for all subsequent tests on such a failure. Read/close errors are now
// reported instead of silently ignored.
func captureStdout(t *testing.T, fn func()) string {
	t.Helper()
	r, w, err := os.Pipe()
	if err != nil {
		t.Fatalf("os.Pipe: %v", err)
	}
	old := os.Stdout
	os.Stdout = w
	defer func() {
		os.Stdout = old
	}()
	fn()
	if err := w.Close(); err != nil {
		t.Fatalf("closing pipe writer: %v", err)
	}
	var buf bytes.Buffer
	if _, err := buf.ReadFrom(r); err != nil {
		t.Fatalf("reading captured stdout: %v", err)
	}
	r.Close()
	return buf.String()
}
// TestStdoutSink_Name checks that the sink identifies itself as "stdout".
func TestStdoutSink_Name(t *testing.T) {
	sink := NewStdoutSink(Config{Enabled: true})
	const want = "stdout"
	if got := sink.Name(); got != want {
		t.Errorf("expected name %q, got %q", want, got)
	}
}
// TestStdoutSink_WriteDoesNotProduceOutput verifies that no JSON data
// (correlated or not) is ever written to stdout.
func TestStdoutSink_WriteDoesNotProduceOutput(t *testing.T) {
	sink := NewStdoutSink(Config{Enabled: true})
	out := captureStdout(t, func() {
		ctx := context.Background()
		if err := sink.Write(ctx, makeLog(true)); err != nil {
			t.Fatalf("Write(correlated) returned error: %v", err)
		}
		if err := sink.Write(ctx, makeLog(false)); err != nil {
			t.Fatalf("Write(orphan) returned error: %v", err)
		}
	})
	if out != "" {
		t.Errorf("stdout must be empty but got: %q", out)
	}
}
// TestStdoutSink_NoopMethods verifies that Flush, Close, and Reopen all
// succeed as no-ops.
func TestStdoutSink_NoopMethods(t *testing.T) {
	sink := NewStdoutSink(Config{Enabled: true})
	ops := []struct {
		msg  string
		call func() error
	}{
		{"Flush returned error: %v", func() error { return sink.Flush(context.Background()) }},
		{"Close returned error: %v", sink.Close},
		{"Reopen returned error: %v", sink.Reopen},
	}
	for _, op := range ops {
		if err := op.call(); err != nil {
			t.Errorf(op.msg, err)
		}
	}
}