feat: observability, IP filtering, stdout/clickhouse fixes (v1.1.11)

- feat(observability): metrics server with /metrics and /health endpoints
- feat(observability): correlation metrics (events, success/failed, reasons, buffers)
- feat(correlation): IP exclusion filter (exact IPs and CIDR ranges)
- feat(correlation): pending orphan delay for late-arriving B events
- fix(stdout): sink is now a no-op for data; JSON must never appear on stdout
- fix(clickhouse): all flush errors were silently discarded, now properly logged
- fix(clickhouse): buffer overflow with DropOnOverflow now logged at WARN
- fix(clickhouse): retry attempts logged at WARN with attempt/delay/error context
- feat(clickhouse): connection success logged at INFO, batch sends at DEBUG
- feat(clickhouse): SetLogger() for external logger injection
- test(stdout): assert stdout remains empty for correlated and orphan logs
- chore(rpm): bump version to 1.1.11, update changelog
- docs: README and architecture.yml updated

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-03-05 11:40:54 +01:00
parent 68f0fcf810
commit e9dcd8ea51
16 changed files with 2035 additions and 116 deletions

404
scripts/test-correlation.sh Executable file
View File

@ -0,0 +1,404 @@
#!/bin/bash
#
# test-correlation.sh - Test script for log correlation debugging
#
# This script sends test HTTP (A) and Network (B) events to the logcorrelator
# Unix sockets and verifies that correlation is working correctly.
#
# Usage:
# ./test-correlation.sh [options]
#
# Options:
# -h, --http-socket PATH Path to HTTP socket (default: /var/run/logcorrelator/http.socket)
# -n, --network-socket PATH Path to Network socket (default: /var/run/logcorrelator/network.socket)
# -c, --count NUM Number of test pairs to send (default: 10)
# -d, --delay MS Delay between pairs in milliseconds (default: 100)
# -v, --verbose Enable verbose output
# -m, --metrics-url URL Metrics server URL (default: http://localhost:8080/metrics)
# --skip-metrics Skip metrics check
# --help Show this help message
#
set -e
# Default values
HTTP_SOCKET="/var/run/logcorrelator/http.socket"
NETWORK_SOCKET="/var/run/logcorrelator/network.socket"
COUNT=10
DELAY_MS=100
VERBOSE=false
METRICS_URL="http://localhost:8080/metrics"
SKIP_METRICS=false
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Print functions
info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
success() {
echo -e "${GREEN}[OK]${NC} $1"
}
warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
error() {
echo -e "${RED}[ERROR]${NC} $1"
}
verbose() {
if [ "$VERBOSE" = true ]; then
echo -e "${BLUE}[DEBUG]${NC} $1"
fi
}
# Show help
show_help() {
head -20 "$0" | tail -17 | sed 's/^#//' | sed 's/^ //'
exit 0
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
-h|--http-socket)
HTTP_SOCKET="$2"
shift 2
;;
-n|--network-socket)
NETWORK_SOCKET="$2"
shift 2
;;
-c|--count)
COUNT="$2"
shift 2
;;
-d|--delay)
DELAY_MS="$2"
shift 2
;;
-v|--verbose)
VERBOSE=true
shift
;;
-m|--metrics-url)
METRICS_URL="$2"
shift 2
;;
--skip-metrics)
SKIP_METRICS=true
shift
;;
--help)
show_help
;;
*)
error "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done
# Check if socat or netcat is available
if command -v socat &> /dev/null; then
SEND_CMD="socat"
elif command -v nc &> /dev/null; then
SEND_CMD="nc"
else
error "Neither socat nor nc (netcat) found. Please install one of them."
echo " Ubuntu/Debian: apt-get install socat OR apt-get install netcat-openbsd"
echo " RHEL/CentOS: yum install socat OR yum install nc"
exit 1
fi
# Function to send data to Unix socket
send_to_socket() {
local socket="$1"
local data="$2"
if [ "$SEND_CMD" = "socat" ]; then
echo "$data" | socat - "UNIX-SENDTO:$socket" 2>/dev/null
else
echo "$data" | nc -U -u "$socket" 2>/dev/null
fi
}
# Function to generate timestamp in nanoseconds
get_timestamp_ns() {
date +%s%N
}
# Function to send HTTP (A) event
send_http_event() {
local src_ip="$1"
local src_port="$2"
local timestamp="$3"
local method="${4:-GET}"
local path="${5:-/test}"
local host="${6:-example.com}"
local json=$(cat <<EOF
{"src_ip":"$src_ip","src_port":$src_port,"dst_ip":"10.0.0.1","dst_port":443,"timestamp":$timestamp,"method":"$method","path":"$path","host":"$host","http_version":"HTTP/1.1","header_user_agent":"TestAgent/1.0","header_accept":"*/*"}
EOF
)
verbose "Sending HTTP event: $json"
send_to_socket "$HTTP_SOCKET" "$json"
}
# Function to send Network (B) event
send_network_event() {
local src_ip="$1"
local src_port="$2"
local timestamp="$3"
local ja3="${4:-abc123}"
local ja4="${5:-def456}"
local json=$(cat <<EOF
{"src_ip":"$src_ip","src_port":$src_port,"dst_ip":"10.0.0.1","dst_port":443,"timestamp":$timestamp,"ja3":"$ja3","ja4":"$ja4","tls_version":"TLS1.3","tls_sni":"example.com"}
EOF
)
verbose "Sending Network event: $json"
send_to_socket "$NETWORK_SOCKET" "$json"
}
# Check sockets exist
check_sockets() {
local errors=0
if [ ! -S "$HTTP_SOCKET" ]; then
error "HTTP socket not found: $HTTP_SOCKET"
errors=$((errors + 1))
else
verbose "HTTP socket found: $HTTP_SOCKET"
fi
if [ ! -S "$NETWORK_SOCKET" ]; then
error "Network socket not found: $NETWORK_SOCKET"
errors=$((errors + 1))
else
verbose "Network socket found: $NETWORK_SOCKET"
fi
if [ $errors -gt 0 ]; then
error "$errors socket(s) not found. Is logcorrelator running?"
exit 1
fi
success "Sockets check passed"
}
# Get metrics from server
get_metrics() {
if [ "$SKIP_METRICS" = true ]; then
return 0
fi
if command -v curl &> /dev/null; then
curl -s "$METRICS_URL" 2>/dev/null || echo "{}"
elif command -v wget &> /dev/null; then
wget -qO- "$METRICS_URL" 2>/dev/null || echo "{}"
else
warn "Neither curl nor wget found. Skipping metrics check."
echo "{}"
fi
}
# Extract value from JSON (simple grep-based, requires jq for complex queries)
get_json_value() {
local json="$1"
local key="$2"
if command -v jq &> /dev/null; then
echo "$json" | jq -r ".$key // 0"
else
# Fallback: simple grep (works for flat JSON)
echo "$json" | grep -o "\"$key\":[0-9]*" | cut -d: -f2 || echo "0"
fi
}
# Main test function
run_test() {
info "Starting correlation test..."
info "Configuration:"
echo " HTTP Socket: $HTTP_SOCKET"
echo " Network Socket: $NETWORK_SOCKET"
echo " Test pairs: $COUNT"
echo " Delay between: ${DELAY_MS}ms"
echo " Metrics URL: $METRICS_URL"
echo " Send command: $SEND_CMD"
echo ""
# Get initial metrics
info "Fetching initial metrics..."
local initial_metrics=$(get_metrics)
local initial_success=$(get_json_value "$initial_metrics" "correlations_success")
local initial_failed=$(get_json_value "$initial_metrics" "correlations_failed")
local initial_a=$(get_json_value "$initial_metrics" "events_received_a")
local initial_b=$(get_json_value "$initial_metrics" "events_received_b")
info "Initial metrics:"
echo " Events A: $initial_a"
echo " Events B: $initial_b"
echo " Success: $initial_success"
echo " Failed: $initial_failed"
echo ""
# Send test events
info "Sending $COUNT test event pairs..."
local base_timestamp=$(get_timestamp_ns)
local sent=0
local correlated=0
for i in $(seq 1 $COUNT); do
local src_ip="192.168.1.$((i % 254 + 1))"
local src_port=$((8000 + i))
# Send A and B with same timestamp (should correlate)
local ts_a=$((base_timestamp + i * 1000000))
local ts_b=$ts_a # Same timestamp for perfect correlation
send_http_event "$src_ip" "$src_port" "$ts_a"
send_network_event "$src_ip" "$src_port" "$ts_b"
sent=$((sent + 1))
verbose "Sent pair $i: $src_ip:$src_port"
if [ $DELAY_MS -gt 0 ]; then
sleep $(echo "scale=3; $DELAY_MS / 1000" | bc)
fi
done
success "Sent $sent event pairs"
echo ""
# Wait for processing
info "Waiting for processing (2 seconds)..."
sleep 2
# Get final metrics
info "Fetching final metrics..."
local final_metrics=$(get_metrics)
local final_success=$(get_json_value "$final_metrics" "correlations_success")
local final_failed=$(get_json_value "$final_metrics" "correlations_failed")
local final_a=$(get_json_value "$final_metrics" "events_received_a")
local final_b=$(get_json_value "$final_metrics" "events_received_b")
# Calculate deltas
local delta_success=$((final_success - initial_success))
local delta_failed=$((final_failed - initial_failed))
local delta_a=$((final_a - initial_a))
local delta_b=$((final_b - initial_b))
echo ""
info "Results:"
echo " Events A sent: $delta_a (expected: $sent)"
echo " Events B sent: $delta_b (expected: $sent)"
echo " Correlations: $delta_success"
echo " Failures: $delta_failed"
echo ""
# Validation
local test_passed=true
if [ "$delta_a" -ne "$sent" ]; then
error "Event A count mismatch: got $delta_a, expected $sent"
test_passed=false
fi
if [ "$delta_b" -ne "$sent" ]; then
error "Event B count mismatch: got $delta_b, expected $sent"
test_passed=false
fi
if [ "$delta_success" -ne "$sent" ]; then
error "Correlation count mismatch: got $delta_success, expected $sent"
test_passed=false
fi
if [ "$delta_failed" -ne 0 ]; then
warn "Unexpected correlation failures: $delta_failed"
fi
if [ "$test_passed" = true ]; then
success "All tests passed! Correlation is working correctly."
exit 0
else
error "Some tests failed. Check the logs for details."
exit 1
fi
}
# Test with time window exceeded
run_time_window_test() {
info "Running time window test (B arrives after time window)..."
local src_ip="192.168.100.1"
local src_port="9999"
# Send A event
local ts_a=$(get_timestamp_ns)
send_http_event "$src_ip" "$src_port" "$ts_a"
info "Sent A event at timestamp $ts_a"
# Wait for time window to expire (default is 10s, we wait 11s)
info "Waiting 11 seconds (time window should expire)..."
sleep 11
# Send B event
local ts_b=$(get_timestamp_ns)
send_network_event "$src_ip" "$src_port" "$ts_b"
info "Sent B event at timestamp $ts_b"
info "This should result in a time_window failure (check metrics)"
}
# Test with different src_ip
run_different_ip_test() {
info "Running different IP test (should NOT correlate)..."
# Send A with IP 192.168.200.1
local ts=$(get_timestamp_ns)
send_http_event "192.168.200.1" "7777" "$ts"
info "Sent A event from 192.168.200.1:7777"
# Send B with different IP
send_network_event "192.168.200.2" "7777" "$ts"
info "Sent B event from 192.168.200.2:7777 (different IP)"
info "These should NOT correlate (different src_ip)"
}
# Run tests
check_sockets
echo ""
# Run main test
run_test
echo ""
info "Additional tests available:"
echo " --test-time-window Test time window expiration"
echo " --test-different-ip Test different IP (no correlation)"
# Check for additional test flags
if [[ "$@" == *"--test-time-window"* ]]; then
echo ""
run_time_window_test
fi
if [[ "$@" == *"--test-different-ip"* ]]; then
echo ""
run_different_ip_test
fi