feat: observability, IP filtering, stdout/clickhouse fixes (v1.1.11)
- feat(observability): metrics server with /metrics and /health endpoints - feat(observability): correlation metrics (events, success/failed, reasons, buffers) - feat(correlation): IP exclusion filter (exact IPs and CIDR ranges) - feat(correlation): pending orphan delay for late-arriving B events - fix(stdout): sink is now a no-op for data; JSON must never appear on stdout - fix(clickhouse): all flush errors were silently discarded, now properly logged - fix(clickhouse): buffer overflow with DropOnOverflow now logged at WARN - fix(clickhouse): retry attempts logged at WARN with attempt/delay/error context - feat(clickhouse): connection success logged at INFO, batch sends at DEBUG - feat(clickhouse): SetLogger() for external logger injection - test(stdout): assert stdout remains empty for correlated and orphan logs - chore(rpm): bump version to 1.1.11, update changelog - docs: README and architecture.yml updated Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -164,9 +164,9 @@ func (s *UnixSocketSource) readDatagrams(ctx context.Context, eventChan chan<- *
|
||||
continue
|
||||
}
|
||||
|
||||
// Debug: log raw events
|
||||
s.logger.Debugf("event received: source=%s src_ip=%s src_port=%d",
|
||||
event.Source, event.SrcIP, event.SrcPort)
|
||||
// Debug: log raw events with all key details
|
||||
s.logger.Debugf("event received: source=%s src_ip=%s src_port=%d timestamp=%v raw_timestamp=%v",
|
||||
event.Source, event.SrcIP, event.SrcPort, event.Timestamp, event.Raw["timestamp"])
|
||||
|
||||
select {
|
||||
case eventChan <- event:
|
||||
|
||||
@ -12,6 +12,7 @@ import (
|
||||
|
||||
"github.com/ClickHouse/clickhouse-go/v2"
|
||||
"github.com/logcorrelator/logcorrelator/internal/domain"
|
||||
"github.com/logcorrelator/logcorrelator/internal/observability"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -53,6 +54,12 @@ type ClickHouseSink struct {
|
||||
done chan struct{}
|
||||
wg sync.WaitGroup
|
||||
closeOnce sync.Once
|
||||
logger *observability.Logger
|
||||
}
|
||||
|
||||
// SetLogger sets the logger used by the sink.
|
||||
func (s *ClickHouseSink) SetLogger(logger *observability.Logger) {
|
||||
s.logger = logger.WithFields(map[string]any{"sink": "clickhouse"})
|
||||
}
|
||||
|
||||
// NewClickHouseSink creates a new ClickHouse sink.
|
||||
@ -83,6 +90,7 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
|
||||
buffer: make([]domain.CorrelatedLog, 0, config.BatchSize),
|
||||
flushChan: make(chan struct{}, 1),
|
||||
done: make(chan struct{}),
|
||||
logger: observability.NewLogger("clickhouse"),
|
||||
}
|
||||
|
||||
// Parse DSN and create options
|
||||
@ -107,6 +115,8 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
|
||||
}
|
||||
|
||||
s.conn = conn
|
||||
s.log().Infof("connected to ClickHouse: table=%s batch_size=%d flush_interval_ms=%d",
|
||||
config.Table, config.BatchSize, config.FlushIntervalMs)
|
||||
|
||||
// Start flush goroutine
|
||||
s.wg.Add(1)
|
||||
@ -120,6 +130,14 @@ func (s *ClickHouseSink) Name() string {
|
||||
return "clickhouse"
|
||||
}
|
||||
|
||||
// log returns the logger, initializing a default one if not set (e.g. in tests).
|
||||
func (s *ClickHouseSink) log() *observability.Logger {
|
||||
if s.logger == nil {
|
||||
s.logger = observability.NewLogger("clickhouse")
|
||||
}
|
||||
return s.logger
|
||||
}
|
||||
|
||||
// Reopen is a no-op for ClickHouse (connection is managed internally).
|
||||
func (s *ClickHouseSink) Reopen() error {
|
||||
return nil
|
||||
@ -146,6 +164,7 @@ func (s *ClickHouseSink) Write(ctx context.Context, log domain.CorrelatedLog) er
|
||||
s.mu.Unlock()
|
||||
|
||||
if drop {
|
||||
s.log().Warnf("buffer full, dropping log: table=%s buffer_size=%d", s.config.Table, s.config.MaxBufferSize)
|
||||
return nil
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
@ -201,7 +220,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
select {
|
||||
case <-s.done:
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("final flush on close failed", err)
|
||||
}
|
||||
cancel()
|
||||
return
|
||||
|
||||
@ -212,7 +233,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
|
||||
if needsFlush {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("periodic flush failed", err)
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
|
||||
@ -223,7 +246,9 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
|
||||
if needsFlush {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
if err := s.doFlush(ctx); err != nil {
|
||||
s.log().Error("batch flush failed", err)
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
@ -247,7 +272,6 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
return fmt.Errorf("clickhouse connection is not initialized")
|
||||
}
|
||||
|
||||
// Log batch info before sending
|
||||
batchSize := len(buffer)
|
||||
|
||||
// Retry logic with exponential backoff
|
||||
@ -255,6 +279,8 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
for attempt := 0; attempt < MaxRetries; attempt++ {
|
||||
if attempt > 0 {
|
||||
delay := RetryBaseDelay * time.Duration(1<<uint(attempt-1))
|
||||
s.log().Warnf("retrying batch insert: attempt=%d/%d delay=%s rows=%d err=%v",
|
||||
attempt+1, MaxRetries, delay, batchSize, lastErr)
|
||||
select {
|
||||
case <-time.After(delay):
|
||||
case <-ctx.Done():
|
||||
@ -264,6 +290,7 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
|
||||
lastErr = s.executeBatch(ctx, buffer)
|
||||
if lastErr == nil {
|
||||
s.log().Debugf("batch sent: rows=%d table=%s", batchSize, s.config.Table)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@ -2,41 +2,22 @@ package stdout
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/logcorrelator/logcorrelator/internal/domain"
|
||||
"github.com/logcorrelator/logcorrelator/internal/observability"
|
||||
)
|
||||
|
||||
// Config holds the stdout sink configuration.
|
||||
type Config struct {
|
||||
Enabled bool
|
||||
Level string // DEBUG, INFO, WARN, ERROR - filters output verbosity
|
||||
}
|
||||
|
||||
// StdoutSink writes correlated logs to stdout as JSON lines.
|
||||
type StdoutSink struct {
|
||||
config Config
|
||||
mu sync.Mutex
|
||||
enc *json.Encoder
|
||||
minLevel observability.LogLevel
|
||||
}
|
||||
// StdoutSink is a no-op data sink. Operational logs are written to stderr
|
||||
// by the observability.Logger; correlated data must never appear on stdout.
|
||||
type StdoutSink struct{}
|
||||
|
||||
// NewStdoutSink creates a new stdout sink.
|
||||
func NewStdoutSink(config Config) *StdoutSink {
|
||||
level := observability.INFO
|
||||
if config.Level != "" {
|
||||
level = observability.ParseLogLevel(strings.ToUpper(config.Level))
|
||||
}
|
||||
return &StdoutSink{
|
||||
config: config,
|
||||
enc: json.NewEncoder(os.Stdout),
|
||||
minLevel: level,
|
||||
}
|
||||
return &StdoutSink{}
|
||||
}
|
||||
|
||||
// Name returns the sink name.
|
||||
@ -49,50 +30,17 @@ func (s *StdoutSink) Reopen() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Write writes a correlated log to stdout.
|
||||
func (s *StdoutSink) Write(ctx context.Context, log domain.CorrelatedLog) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
// Filter output based on correlation status and log level
|
||||
// DEBUG: all logs, INFO: only correlated, WARN: only correlated with issues, ERROR: none
|
||||
if !s.shouldEmit(log) {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := s.enc.Encode(log); err != nil {
|
||||
return fmt.Errorf("failed to write to stdout: %w", err)
|
||||
}
|
||||
|
||||
// Write is a no-op: correlated data must never be written to stdout.
|
||||
func (s *StdoutSink) Write(_ context.Context, _ domain.CorrelatedLog) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// shouldEmit determines if a log should be emitted based on its correlation status and configured level.
|
||||
func (s *StdoutSink) shouldEmit(log domain.CorrelatedLog) bool {
|
||||
switch s.minLevel {
|
||||
case observability.DEBUG:
|
||||
// Emit everything including orphans
|
||||
return true
|
||||
case observability.INFO:
|
||||
// Emit all correlated logs, skip orphans
|
||||
return log.Correlated
|
||||
case observability.WARN:
|
||||
// Emit only correlated logs (same as INFO for now, can be extended)
|
||||
return log.Correlated
|
||||
case observability.ERROR:
|
||||
// Never emit to stdout at ERROR level
|
||||
return false
|
||||
default:
|
||||
return log.Correlated
|
||||
}
|
||||
}
|
||||
|
||||
// Flush flushes any buffered data (no-op for stdout).
|
||||
func (s *StdoutSink) Flush(ctx context.Context) error {
|
||||
// Flush is a no-op for stdout.
|
||||
func (s *StdoutSink) Flush(_ context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Close closes the sink (no-op for stdout).
|
||||
// Close is a no-op for stdout.
|
||||
func (s *StdoutSink) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
81
internal/adapters/outbound/stdout/sink_test.go
Normal file
81
internal/adapters/outbound/stdout/sink_test.go
Normal file
@ -0,0 +1,81 @@
|
||||
package stdout
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/logcorrelator/logcorrelator/internal/domain"
|
||||
)
|
||||
|
||||
func makeLog(correlated bool) domain.CorrelatedLog {
|
||||
return domain.CorrelatedLog{
|
||||
Timestamp: time.Unix(1700000000, 0),
|
||||
SrcIP: "1.2.3.4",
|
||||
SrcPort: 12345,
|
||||
Correlated: correlated,
|
||||
}
|
||||
}
|
||||
|
||||
// captureStdout replaces os.Stdout temporarily and returns what was written.
|
||||
func captureStdout(t *testing.T, fn func()) string {
|
||||
t.Helper()
|
||||
r, w, err := os.Pipe()
|
||||
if err != nil {
|
||||
t.Fatalf("os.Pipe: %v", err)
|
||||
}
|
||||
old := os.Stdout
|
||||
os.Stdout = w
|
||||
|
||||
fn()
|
||||
|
||||
w.Close()
|
||||
os.Stdout = old
|
||||
|
||||
var buf bytes.Buffer
|
||||
buf.ReadFrom(r)
|
||||
r.Close()
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func TestStdoutSink_Name(t *testing.T) {
|
||||
s := NewStdoutSink(Config{Enabled: true})
|
||||
if s.Name() != "stdout" {
|
||||
t.Errorf("expected name 'stdout', got %q", s.Name())
|
||||
}
|
||||
}
|
||||
|
||||
// TestStdoutSink_WriteDoesNotProduceOutput verifies that no JSON data
|
||||
// (correlated or not) is ever written to stdout.
|
||||
func TestStdoutSink_WriteDoesNotProduceOutput(t *testing.T) {
|
||||
s := NewStdoutSink(Config{Enabled: true})
|
||||
|
||||
got := captureStdout(t, func() {
|
||||
if err := s.Write(context.Background(), makeLog(true)); err != nil {
|
||||
t.Fatalf("Write(correlated) returned error: %v", err)
|
||||
}
|
||||
if err := s.Write(context.Background(), makeLog(false)); err != nil {
|
||||
t.Fatalf("Write(orphan) returned error: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if got != "" {
|
||||
t.Errorf("stdout must be empty but got: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStdoutSink_NoopMethods(t *testing.T) {
|
||||
s := NewStdoutSink(Config{Enabled: true})
|
||||
|
||||
if err := s.Flush(context.Background()); err != nil {
|
||||
t.Errorf("Flush returned error: %v", err)
|
||||
}
|
||||
if err := s.Close(); err != nil {
|
||||
t.Errorf("Close returned error: %v", err)
|
||||
}
|
||||
if err := s.Reopen(); err != nil {
|
||||
t.Errorf("Reopen returned error: %v", err)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user