-- =============================================================================
-- 06_ml_tables.sql — ML detection results tables
-- Source: bot_detector/deploy_views.sql sections 6-6b + deploy_schema.sql items 11-12
-- =============================================================================
--
-- Dialect: ClickHouse. Every statement below is idempotent
-- (CREATE TABLE IF NOT EXISTS / CREATE OR REPLACE VIEW), so the file can be
-- re-applied safely. Note that IF NOT EXISTS leaves an already-deployed table
-- untouched: editing a column list here does not alter an existing table.
--
-- Column order in both tables is significant for positional INSERTs and must
-- not be rearranged.

-- -----------------------------------------------------------------------------
-- ml_detected_anomalies — anomaly detections above threat threshold
-- -----------------------------------------------------------------------------
-- Engine notes:
--   * ReplacingMergeTree(detected_at) with ORDER BY (src_ip): rows sharing the
--     same src_ip are collapsed by background merges, keeping the row with the
--     greatest detected_at. Deduplication is eventual, so a reader may still
--     see several rows per src_ip until the parts are merged (or unless the
--     query uses FINAL).
--   * TTL: rows are dropped 30 days after detected_at.
CREATE TABLE IF NOT EXISTS mabase_prod.ml_detected_anomalies
(
    detected_at                 DateTime,
    src_ip                      IPv6,
    ja4                         String,
    host                        String,
    bot_name                    String,
    anomaly_score               Float32,
    threat_level                String,
    model_name                  String,
    recurrence                  UInt32,
    asn_number                  String,
    asn_org                     String,
    asn_detail                  String,
    asn_domain                  String,
    country_code                String,
    asn_label                   String,
    hits                        UInt64,
    hit_velocity                Float32,
    fuzzing_index               Float32,
    post_ratio                  Float32,
    port_exhaustion_ratio       Float32,
    max_keepalives              UInt32,
    orphan_ratio                Float32,
    tcp_jitter_variance         Float32,
    tcp_shared_count            UInt32,
    true_window_size            UInt64,
    window_mss_ratio            Float32,
    alpn_http_mismatch          UInt8,
    is_alpn_missing             UInt8,
    sni_host_mismatch           UInt8,
    header_count                UInt16,
    has_accept_language         UInt8,
    has_cookie                  UInt8,
    has_referer                 UInt8,
    modern_browser_score        UInt8,
    is_headless                 UInt8,
    ua_ch_mismatch              UInt8,
    header_order_shared_count   UInt32,
    ip_id_zero_ratio            Float32,
    request_size_variance       Float32,
    multiplexing_efficiency     Float32,
    mss_mobile_mismatch         UInt8,
    correlated                  UInt8,
    reason                      String,
    asset_ratio                 Float32,
    direct_access_ratio         Float32,
    is_ua_rotating              UInt8,
    distinct_ja4_count          UInt32,
    src_port_density            Float32,
    ja4_asn_concentration       Float32,
    ja4_country_concentration   Float32,
    is_rare_ja4                 UInt8,
    header_order_confidence     Float32,
    distinct_header_orders      UInt32,
    temporal_entropy            Float32,
    path_diversity_ratio        Float32,
    url_depth_variance          Float32,
    anomalous_payload_ratio     Float32,
    -- v11 additions
    campaign_id                 Int32 DEFAULT -1,
    raw_anomaly_score           Float32 DEFAULT 0,
    -- Anubis enrichment (deploy_schema.sql item 11)
    anubis_bot_name             LowCardinality(String) DEFAULT '',
    anubis_bot_action           LowCardinality(String) DEFAULT '',
    anubis_bot_category         LowCardinality(String) DEFAULT ''
)
ENGINE = ReplacingMergeTree(detected_at)
ORDER BY (src_ip)
TTL detected_at + INTERVAL 30 DAY;

-- -----------------------------------------------------------------------------
-- ml_all_scores — all classifications (no threshold, for observability)
-- -----------------------------------------------------------------------------
-- Engine notes:
--   * ReplacingMergeTree(detected_at): rows with the same
--     (window_start, src_ip, ja4, host, model_name) are eventually collapsed
--     to the one with the greatest detected_at.
--   * TTL: rows are dropped 3 days after window_start.
--   * NOTE(review): campaign_id has no DEFAULT here, whereas
--     ml_detected_anomalies declares DEFAULT -1. Inserts that omit the column
--     will therefore get the type default instead of -1 — confirm this
--     difference is intended.
CREATE TABLE IF NOT EXISTS mabase_prod.ml_all_scores
(
    detected_at           DateTime,
    window_start          DateTime,
    src_ip                IPv6,
    ja4                   String,
    host                  String,
    bot_name              String,
    anomaly_score         Float32,
    raw_anomaly_score     Float32,
    threat_level          String,
    model_name            String,
    correlated            UInt8,
    asn_number            String,
    asn_org               String,
    country_code          String,
    asn_label             String,
    hits                  UInt64,
    hit_velocity          Float32,
    fuzzing_index         Float32,
    post_ratio            Float32,
    campaign_id           Int32,
    -- Anubis enrichment (deploy_schema.sql item 12)
    anubis_bot_name       LowCardinality(String) DEFAULT '',
    anubis_bot_action     LowCardinality(String) DEFAULT '',
    anubis_bot_category   LowCardinality(String) DEFAULT ''
)
ENGINE = ReplacingMergeTree(detected_at)
ORDER BY (window_start, src_ip, ja4, host, model_name)
TTL window_start + INTERVAL 3 DAY
SETTINGS index_granularity = 8192;

-- -----------------------------------------------------------------------------
-- view_ip_recurrence — recurrence aggregation over ml_detected_anomalies
-- -----------------------------------------------------------------------------
-- One output row per src_ip:
--   recurrence          number of rows currently stored for the IP
--   first_seen          earliest detected_at
--   last_seen           latest detected_at
--   worst_score         lowest anomaly_score (the view treats lowest as worst)
--   worst_threat_level  threat_level of the row holding that lowest score
--
-- NOTE(review): the base table deduplicates on src_ip (ReplacingMergeTree,
-- ORDER BY src_ip) and expires rows after 30 days, so count() here reflects
-- only the rows not yet merged or expired. The value can shrink after a
-- background merge. Confirm whether this view or the stored `recurrence`
-- column is meant to be the authoritative figure.
CREATE OR REPLACE VIEW mabase_prod.view_ip_recurrence AS
SELECT
    src_ip,
    count()                             AS recurrence,
    min(detected_at)                    AS first_seen,
    max(detected_at)                    AS last_seen,
    min(anomaly_score)                  AS worst_score,
    argMin(threat_level, anomaly_score) AS worst_threat_level
FROM mabase_prod.ml_detected_anomalies
GROUP BY src_ip;