fix: durcir la validation et fiabiliser flush/arrêt idempotents
Co-authored-by: aider (openrouter/openai/gpt-5.3-codex) <aider@aider.chat>
This commit is contained in:
@ -40,6 +40,7 @@ type UnixSocketSource struct {
|
||||
done chan struct{}
|
||||
wg sync.WaitGroup
|
||||
semaphore chan struct{} // Limit concurrent connections
|
||||
stopOnce sync.Once
|
||||
}
|
||||
|
||||
// NewUnixSocketSource creates a new Unix socket source.
|
||||
@ -58,6 +59,10 @@ func (s *UnixSocketSource) Name() string {
|
||||
|
||||
// Start begins listening on the Unix socket.
|
||||
func (s *UnixSocketSource) Start(ctx context.Context, eventChan chan<- *domain.NormalizedEvent) error {
|
||||
if strings.TrimSpace(s.config.Path) == "" {
|
||||
return fmt.Errorf("socket path cannot be empty")
|
||||
}
|
||||
|
||||
// Remove existing socket file if present
|
||||
if info, err := os.Stat(s.config.Path); err == nil {
|
||||
if info.Mode()&os.ModeSocket != 0 {
|
||||
@ -78,8 +83,8 @@ func (s *UnixSocketSource) Start(ctx context.Context, eventChan chan<- *domain.N
|
||||
|
||||
// Set permissions - fail if we can't
|
||||
if err := os.Chmod(s.config.Path, DefaultSocketPermissions); err != nil {
|
||||
listener.Close()
|
||||
os.Remove(s.config.Path)
|
||||
_ = listener.Close()
|
||||
_ = os.Remove(s.config.Path)
|
||||
return fmt.Errorf("failed to set socket permissions: %w", err)
|
||||
}
|
||||
|
||||
@ -120,7 +125,7 @@ func (s *UnixSocketSource) acceptConnections(ctx context.Context, eventChan chan
|
||||
// Connection accepted
|
||||
default:
|
||||
// Too many connections, reject
|
||||
conn.Close()
|
||||
_ = conn.Close()
|
||||
continue
|
||||
}
|
||||
|
||||
@ -136,7 +141,7 @@ func (s *UnixSocketSource) acceptConnections(ctx context.Context, eventChan chan
|
||||
|
||||
func (s *UnixSocketSource) readEvents(ctx context.Context, conn net.Conn, eventChan chan<- *domain.NormalizedEvent) {
|
||||
// Set read deadline to prevent hanging
|
||||
conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
|
||||
_ = conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
|
||||
|
||||
scanner := bufio.NewScanner(conn)
|
||||
// Increase buffer size limit to 1MB
|
||||
@ -167,10 +172,6 @@ func (s *UnixSocketSource) readEvents(ctx context.Context, conn net.Conn, eventC
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
// Connection error, log but don't crash
|
||||
}
|
||||
}
|
||||
|
||||
func parseJSONEvent(data []byte) (*domain.NormalizedEvent, error) {
|
||||
@ -314,21 +315,26 @@ func getInt64(m map[string]any, key string) (int64, bool) {
|
||||
|
||||
// Stop gracefully stops the source.
|
||||
func (s *UnixSocketSource) Stop() error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
var stopErr error
|
||||
|
||||
close(s.done)
|
||||
s.stopOnce.Do(func() {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
if s.listener != nil {
|
||||
s.listener.Close()
|
||||
}
|
||||
close(s.done)
|
||||
|
||||
s.wg.Wait()
|
||||
if s.listener != nil {
|
||||
_ = s.listener.Close()
|
||||
}
|
||||
|
||||
// Clean up socket file
|
||||
if err := os.Remove(s.config.Path); err != nil && !os.IsNotExist(err) {
|
||||
return fmt.Errorf("failed to remove socket file: %w", err)
|
||||
}
|
||||
s.wg.Wait()
|
||||
|
||||
return nil
|
||||
// Clean up socket file
|
||||
if err := os.Remove(s.config.Path); err != nil && !os.IsNotExist(err) {
|
||||
stopErr = fmt.Errorf("failed to remove socket file: %w", err)
|
||||
return
|
||||
}
|
||||
})
|
||||
|
||||
return stopErr
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -49,10 +50,18 @@ type ClickHouseSink struct {
|
||||
flushChan chan struct{}
|
||||
done chan struct{}
|
||||
wg sync.WaitGroup
|
||||
closeOnce sync.Once
|
||||
}
|
||||
|
||||
// NewClickHouseSink creates a new ClickHouse sink.
|
||||
func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
|
||||
if strings.TrimSpace(config.DSN) == "" {
|
||||
return nil, fmt.Errorf("clickhouse DSN is required")
|
||||
}
|
||||
if strings.TrimSpace(config.Table) == "" {
|
||||
return nil, fmt.Errorf("clickhouse table is required")
|
||||
}
|
||||
|
||||
// Apply defaults
|
||||
if config.BatchSize <= 0 {
|
||||
config.BatchSize = DefaultBatchSize
|
||||
@ -85,7 +94,7 @@ func NewClickHouseSink(config Config) (*ClickHouseSink, error) {
|
||||
defer pingCancel()
|
||||
|
||||
if err := db.PingContext(pingCtx); err != nil {
|
||||
db.Close()
|
||||
_ = db.Close()
|
||||
return nil, fmt.Errorf("failed to ping ClickHouse: %w", err)
|
||||
}
|
||||
|
||||
@ -143,13 +152,28 @@ func (s *ClickHouseSink) Flush(ctx context.Context) error {
|
||||
|
||||
// Close closes the sink.
|
||||
func (s *ClickHouseSink) Close() error {
|
||||
close(s.done)
|
||||
s.wg.Wait()
|
||||
var closeErr error
|
||||
|
||||
if s.db != nil {
|
||||
return s.db.Close()
|
||||
}
|
||||
return nil
|
||||
s.closeOnce.Do(func() {
|
||||
if s.done != nil {
|
||||
close(s.done)
|
||||
}
|
||||
s.wg.Wait()
|
||||
|
||||
flushCtx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
defer cancel()
|
||||
if err := s.doFlush(flushCtx); err != nil {
|
||||
closeErr = err
|
||||
}
|
||||
|
||||
if s.db != nil {
|
||||
if err := s.db.Close(); err != nil && closeErr == nil {
|
||||
closeErr = err
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return closeErr
|
||||
}
|
||||
|
||||
func (s *ClickHouseSink) flushLoop() {
|
||||
@ -161,25 +185,30 @@ func (s *ClickHouseSink) flushLoop() {
|
||||
for {
|
||||
select {
|
||||
case <-s.done:
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
_ = s.doFlush(ctx)
|
||||
cancel()
|
||||
return
|
||||
|
||||
case <-ticker.C:
|
||||
s.mu.Lock()
|
||||
needsFlush := len(s.buffer) > 0
|
||||
s.mu.Unlock()
|
||||
|
||||
if needsFlush {
|
||||
// Use timeout context for flush
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
s.doFlush(ctx)
|
||||
_ = s.doFlush(ctx)
|
||||
cancel()
|
||||
}
|
||||
|
||||
case <-s.flushChan:
|
||||
s.mu.Lock()
|
||||
needsFlush := len(s.buffer) >= s.config.BatchSize
|
||||
s.mu.Unlock()
|
||||
|
||||
if needsFlush {
|
||||
// Use timeout context for flush
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(s.config.TimeoutMs)*time.Millisecond)
|
||||
s.doFlush(ctx)
|
||||
_ = s.doFlush(ctx)
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
@ -199,7 +228,10 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
s.buffer = make([]domain.CorrelatedLog, 0, s.config.BatchSize)
|
||||
s.mu.Unlock()
|
||||
|
||||
// Prepare batch insert with retry
|
||||
if s.db == nil {
|
||||
return fmt.Errorf("clickhouse connection is not initialized")
|
||||
}
|
||||
|
||||
query := fmt.Sprintf(`
|
||||
INSERT INTO %s (timestamp, src_ip, src_port, dst_ip, dst_port, correlated, orphan_side, apache, network)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
@ -209,7 +241,6 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < MaxRetries; attempt++ {
|
||||
if attempt > 0 {
|
||||
// Exponential backoff
|
||||
delay := RetryBaseDelay * time.Duration(1<<uint(attempt-1))
|
||||
select {
|
||||
case <-time.After(delay):
|
||||
@ -220,10 +251,9 @@ func (s *ClickHouseSink) doFlush(ctx context.Context) error {
|
||||
|
||||
lastErr = s.executeBatch(ctx, query, buffer)
|
||||
if lastErr == nil {
|
||||
return nil // Success
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check if error is retryable
|
||||
if !isRetryableError(lastErr) {
|
||||
return fmt.Errorf("non-retryable error: %w", lastErr)
|
||||
}
|
||||
@ -249,11 +279,6 @@ func (s *ClickHouseSink) executeBatch(ctx context.Context, query string, buffer
|
||||
apacheJSON, _ := json.Marshal(log.Apache)
|
||||
networkJSON, _ := json.Marshal(log.Network)
|
||||
|
||||
orphanSide := log.OrphanSide
|
||||
if !log.Correlated {
|
||||
orphanSide = log.OrphanSide
|
||||
}
|
||||
|
||||
correlated := 0
|
||||
if log.Correlated {
|
||||
correlated = 1
|
||||
@ -266,7 +291,7 @@ func (s *ClickHouseSink) executeBatch(ctx context.Context, query string, buffer
|
||||
log.DstIP,
|
||||
log.DstPort,
|
||||
correlated,
|
||||
orphanSide,
|
||||
log.OrphanSide,
|
||||
string(apacheJSON),
|
||||
string(networkJSON),
|
||||
)
|
||||
@ -287,8 +312,7 @@ func isRetryableError(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
errStr := err.Error()
|
||||
// Common retryable errors
|
||||
errStr := strings.ToLower(err.Error())
|
||||
retryableErrors := []string{
|
||||
"connection refused",
|
||||
"connection reset",
|
||||
@ -298,36 +322,9 @@ func isRetryableError(err error) bool {
|
||||
"broken pipe",
|
||||
}
|
||||
for _, re := range retryableErrors {
|
||||
if containsIgnoreCase(errStr, re) {
|
||||
if strings.Contains(errStr, re) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func containsIgnoreCase(s, substr string) bool {
|
||||
return len(s) >= len(substr) && containsLower(s, substr)
|
||||
}
|
||||
|
||||
func containsLower(s, substr string) bool {
|
||||
s = toLower(s)
|
||||
substr = toLower(substr)
|
||||
for i := 0; i <= len(s)-len(substr); i++ {
|
||||
if s[i:i+len(substr)] == substr {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// toLower returns a copy of s with the ASCII letters 'A'-'Z' replaced by their
// lowercase counterparts. Every other byte, including the bytes of multi-byte
// UTF-8 sequences, is copied through unchanged.
func toLower(s string) string {
	// The output has exactly len(s) bytes, so allocate it once instead of
	// growing it one append at a time.
	out := make([]byte, len(s))
	for i := 0; i < len(s); i++ {
		c := s[i]
		if 'A' <= c && c <= 'Z' {
			c += 'a' - 'A'
		}
		out[i] = c
	}
	return string(out)
}
|
||||
|
||||
Reference in New Issue
Block a user