Files
ja4-platform/shared/clickhouse/06_ml_tables.sql
toto d469e39da7 feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized
Services:
- ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap)
- logcorrelator: JA4 log correlation engine (Go, ClickHouse)
- mod_reqin_log: Apache module (C, JSON request logging)
- bot_detector: ML bot detection pipeline (Python)
- dashboard: FastAPI/Streamlit analytics UI (Python)

Shared libraries:
- shared/go/ja4common: logger, config, shutdown, ipfilter (Go module)
- shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package)
- shared/clickhouse/: canonical SQL migrations (10 files)

Build & packaging:
- Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10)
- go.work workspace linking sentinel, correlator, ja4common
- Makefile with test-all, build-all, rpm-* targets

Fixes applied:
- go.work: 1.21 → 1.24.6 (required by sentinel)
- correlator Dockerfiles: golang:1.21 → golang:1.24
- replace directives in go.mod for ja4common local path
- pyproject.toml: setuptools.backends → setuptools.build_meta
- Removed static libpcap linking (unavailable on Rocky 9)
- Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32)
- Rewrote corrupted test files (logger_test.go × 2)

Test coverage:
- correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%)
- sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse)

Documentation:
- README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-07 16:42:59 +02:00

91 lines
4.2 KiB
SQL

-- =============================================================================
-- 06_ml_tables.sql — ML detection results tables
-- Source: bot_detector/deploy_views.sql sections 6-6b + deploy_schema.sql items 11-12
-- =============================================================================
-- -----------------------------------------------------------------------------
-- ml_detected_anomalies — anomaly detections above threat threshold
--
-- Wide table: one row carries the verdict columns, the ASN/country columns and
-- every per-feature column listed below. Column order is part of the contract
-- for any INSERT that omits a column list, so it must not be rearranged.
--
-- Storage: ReplacingMergeTree versioned by detected_at, sorted by src_ip only;
-- rows expire 30 days after detected_at.
-- NOTE(review): with src_ip as the whole sorting key, background merges are
-- expected to collapse rows to the most recent detected_at per src_ip, which
-- would make count() in view_ip_recurrence depend on merge state — confirm
-- this is intended.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.ml_detected_anomalies
(
    -- Detection identity and verdict
    detected_at                DateTime,
    src_ip                     IPv6,
    ja4                        String,
    host                       String,
    bot_name                   String,
    anomaly_score              Float32,
    threat_level               String,
    model_name                 String,
    recurrence                 UInt32,

    -- ASN / country columns
    asn_number                 String,
    asn_org                    String,
    asn_detail                 String,
    asn_domain                 String,
    country_code               String,
    asn_label                  String,

    -- Per-feature columns (same order as the original definition)
    hits                       UInt64,
    hit_velocity               Float32,
    fuzzing_index              Float32,
    post_ratio                 Float32,
    port_exhaustion_ratio      Float32,
    max_keepalives             UInt32,
    orphan_ratio               Float32,
    tcp_jitter_variance        Float32,
    tcp_shared_count           UInt32,
    true_window_size           UInt64,
    window_mss_ratio           Float32,
    alpn_http_mismatch         UInt8,
    is_alpn_missing            UInt8,
    sni_host_mismatch          UInt8,
    header_count               UInt16,
    has_accept_language        UInt8,
    has_cookie                 UInt8,
    has_referer                UInt8,
    modern_browser_score       UInt8,
    is_headless                UInt8,
    ua_ch_mismatch             UInt8,
    header_order_shared_count  UInt32,
    ip_id_zero_ratio           Float32,
    request_size_variance      Float32,
    multiplexing_efficiency    Float32,
    mss_mobile_mismatch        UInt8,
    correlated                 UInt8,
    reason                     String,
    asset_ratio                Float32,
    direct_access_ratio        Float32,
    is_ua_rotating             UInt8,
    distinct_ja4_count         UInt32,
    src_port_density           Float32,
    ja4_asn_concentration      Float32,
    ja4_country_concentration  Float32,
    is_rare_ja4                UInt8,
    header_order_confidence    Float32,
    distinct_header_orders     UInt32,
    temporal_entropy           Float32,
    path_diversity_ratio       Float32,
    url_depth_variance         Float32,
    anomalous_payload_ratio    Float32,

    -- v11 additions (defaults: campaign_id = -1, raw_anomaly_score = 0)
    campaign_id                Int32 DEFAULT -1,
    raw_anomaly_score          Float32 DEFAULT 0,

    -- Anubis enrichment (deploy_schema.sql item 11); empty string when unset
    anubis_bot_name            LowCardinality(String) DEFAULT '',
    anubis_bot_action          LowCardinality(String) DEFAULT '',
    anubis_bot_category        LowCardinality(String) DEFAULT ''
)
ENGINE = ReplacingMergeTree(detected_at)
ORDER BY (src_ip)
TTL detected_at + INTERVAL 30 DAY;
-- -----------------------------------------------------------------------------
-- ml_all_scores — all classifications (no threshold, for observability)
--
-- Storage: ReplacingMergeTree versioned by detected_at. The sorting key is
-- (window_start, src_ip, ja4, host, model_name); rows expire 3 days after
-- window_start. Column order is kept exactly as originally declared.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.ml_all_scores
(
    -- Timing
    detected_at          DateTime,
    window_start         DateTime,

    -- Identity and verdict
    src_ip               IPv6,
    ja4                  String,
    host                 String,
    bot_name             String,
    anomaly_score        Float32,
    raw_anomaly_score    Float32,
    threat_level         String,
    model_name           String,
    correlated           UInt8,

    -- ASN / country columns
    asn_number           String,
    asn_org              String,
    country_code         String,
    asn_label            String,

    -- Feature subset and campaign id (no DEFAULT declared on campaign_id here)
    hits                 UInt64,
    hit_velocity         Float32,
    fuzzing_index        Float32,
    post_ratio           Float32,
    campaign_id          Int32,

    -- Anubis enrichment (deploy_schema.sql item 12); empty string when unset
    anubis_bot_name      LowCardinality(String) DEFAULT '',
    anubis_bot_action    LowCardinality(String) DEFAULT '',
    anubis_bot_category  LowCardinality(String) DEFAULT ''
)
ENGINE = ReplacingMergeTree(detected_at)
ORDER BY (window_start, src_ip, ja4, host, model_name)
TTL window_start + INTERVAL 3 DAY
SETTINGS index_granularity = 8192;
-- -----------------------------------------------------------------------------
-- view_ip_recurrence — recurrence aggregation over ml_detected_anomalies
--
-- Produces one row per src_ip. The view reads the table without FINAL, so the
-- aggregates are computed over whatever rows are physically present.
-- -----------------------------------------------------------------------------
CREATE OR REPLACE VIEW mabase_prod.view_ip_recurrence AS
SELECT
    src_ip,
    count()                             AS recurrence,          -- rows currently stored for this IP
    min(detected_at)                    AS first_seen,          -- earliest detection timestamp
    max(detected_at)                    AS last_seen,           -- latest detection timestamp
    min(anomaly_score)                  AS worst_score,         -- the lowest score is treated as the worst
    argMin(threat_level, anomaly_score) AS worst_threat_level   -- threat_level of the lowest-scoring row
FROM mabase_prod.ml_detected_anomalies
GROUP BY src_ip;