From 79d86eba23b56718253bcc37b400fb0ad70708d3 Mon Sep 17 00:00:00 2001 From: toto Date: Thu, 12 Mar 2026 11:21:11 +0100 Subject: [PATCH] schema sql --- sql/bots.sql | 11 +++ sql/views.sql | 252 +++++++++++++++++++++++--------------------------- 2 files changed, 127 insertions(+), 136 deletions(-) diff --git a/sql/bots.sql b/sql/bots.sql index a96eb34..2e1e4ad 100644 --- a/sql/bots.sql +++ b/sql/bots.sql @@ -8,3 +8,14 @@ CREATE TABLE mabase_prod.ref_bot_networks ( last_update DateTime ) ENGINE = ReplacingMergeTree(last_update) ORDER BY (network, bot_name); + + +-- Create the table that reads the IP list file +CREATE TABLE mabase_prod.bot_ip ( + ip String +) ENGINE = File(CSV, 'bot_ip.csv'); + +-- Create the table that reads the JA4 signature file +CREATE TABLE mabase_prod.bot_ja4 ( + ja4 String +) ENGINE = File(CSV, 'bot_ja4.csv'); diff --git a/sql/views.sql b/sql/views.sql index 9860862..18373a9 100644 --- a/sql/views.sql +++ b/sql/views.sql @@ -1,28 +1,21 @@ -- ============================================================================ --- SCRIPT DE DÉPLOIEMENT DES VUES DE DÉTECTION DE BOTS & SPAM (CLICKHOUSE) +-- DEPLOYMENT SCRIPT FOR THE DETECTION VIEWS (CLICKHOUSE) - MABASE_PROD -- ============================================================================ --- ---------------------------------------------------------------------------- --- 1. 
NETTOYAGE STRICT --- ---------------------------------------------------------------------------- DROP TABLE IF EXISTS mabase_prod.ml_detected_anomalies; - DROP VIEW IF EXISTS mabase_prod.view_ai_features_1h; DROP VIEW IF EXISTS mabase_prod.view_host_ip_ja4_rotation; DROP VIEW IF EXISTS mabase_prod.view_host_ja4_anomalies; DROP VIEW IF EXISTS mabase_prod.view_form_bruteforce_detected; DROP VIEW IF EXISTS mabase_prod.view_alpn_mismatch_detected; DROP VIEW IF EXISTS mabase_prod.view_tcp_spoofing_detected; - DROP VIEW IF EXISTS mabase_prod.mv_agg_host_ip_ja4_1h; DROP TABLE IF EXISTS mabase_prod.agg_host_ip_ja4_1h; - DROP VIEW IF EXISTS mabase_prod.mv_agg_header_fingerprint_1h; DROP TABLE IF EXISTS mabase_prod.agg_header_fingerprint_1h; - -- ---------------------------------------------------------------------------- --- 2. TABLES D'AGRÉGATION ET VUES MATÉRIALISÉES (TEMPS RÉEL) +-- 1. AGGREGATION TABLES AND MATERIALIZED VIEWS -- ---------------------------------------------------------------------------- CREATE TABLE mabase_prod.agg_host_ip_ja4_1h ( @@ -44,41 +37,52 @@ CREATE TABLE mabase_prod.agg_host_ip_ja4_1h ( tcp_mss SimpleAggregateFunction(any, UInt32), tcp_ttl SimpleAggregateFunction(any, UInt32), http_version SimpleAggregateFunction(any, String), - first_ua SimpleAggregateFunction(any, String) + tls_alpn SimpleAggregateFunction(any, String), + tls_sni SimpleAggregateFunction(any, String), + first_ua SimpleAggregateFunction(any, String), + correlated SimpleAggregateFunction(max, UInt8), + unique_src_ports AggregateFunction(uniq, UInt16), + max_keepalives SimpleAggregateFunction(max, UInt32), + orphan_count SimpleAggregateFunction(sum, UInt64) ) ENGINE = AggregatingMergeTree() ORDER BY (window_start, src_ip, ja4, host) TTL window_start + INTERVAL 7 DAY; -CREATE MATERIALIZED VIEW mabase_prod.mv_agg_host_ip_ja4_1h -TO mabase_prod.agg_host_ip_ja4_1h AS +CREATE MATERIALIZED VIEW mabase_prod.mv_agg_host_ip_ja4_1h TO mabase_prod.agg_host_ip_ja4_1h AS SELECT - 
toStartOfHour(time) AS window_start, - src_ip, - ja4, - host, - min(time) AS first_seen, - max(time) AS last_seen, + toStartOfHour(src.time) AS window_start, + src.src_ip, src.ja4, src.host, + min(src.time) AS first_seen, max(src.time) AS last_seen, count() AS hits, - sum(IF(method = 'POST', 1, 0)) AS count_post, - uniqState(path) AS uniq_paths, - uniqState(query) AS uniq_query_params, - any(src_country_code) AS src_country_code, - any(toString(cityHash64(concat(toString(tcp_meta_window_size), toString(tcp_meta_mss), toString(tcp_meta_window_scale), tcp_meta_options)))) AS tcp_fingerprint, - varPopState(toFloat64(syn_to_clienthello_ms)) AS tcp_jitter_variance, - any(tcp_meta_window_size) AS tcp_window_size, - any(tcp_meta_window_scale) AS tcp_window_scale, - any(tcp_meta_mss) AS tcp_mss, - any(ip_meta_ttl) AS tcp_ttl, - any(http_version) AS http_version, - any(header_user_agent) AS first_ua -FROM mabase_prod.http_logs -GROUP BY window_start, src_ip, ja4, host; - + sum(IF(src.method = 'POST', 1, 0)) AS count_post, + uniqState(src.path) AS uniq_paths, + uniqState(src.query) AS uniq_query_params, + any(src.src_country_code) AS src_country_code, + any(toString(cityHash64(concat(toString(src.tcp_meta_window_size), toString(src.tcp_meta_mss), toString(src.tcp_meta_window_scale), src.tcp_meta_options)))) AS tcp_fingerprint, + varPopState(toFloat64(src.syn_to_clienthello_ms)) AS tcp_jitter_variance, + any(src.tcp_meta_window_size) AS tcp_window_size, + any(src.tcp_meta_window_scale) AS tcp_window_scale, + any(src.tcp_meta_mss) AS tcp_mss, + any(src.ip_meta_ttl) AS tcp_ttl, + any(src.http_version) AS http_version, + any(src.tls_alpn) AS tls_alpn, + any(src.tls_sni) AS tls_sni, + any(src.header_user_agent) AS first_ua, + max(toUInt8(src.correlated)) AS correlated, + uniqState(toUInt16(src.src_port)) AS unique_src_ports, + max(toUInt32(src.keepalives)) AS max_keepalives, + sum(IF(src.orphan_side = 'A' OR toUInt8(src.correlated) = 0, 1, 0)) AS orphan_count +FROM 
mabase_prod.http_logs AS src +GROUP BY window_start, src.src_ip, src.ja4, src.host; CREATE TABLE mabase_prod.agg_header_fingerprint_1h ( window_start DateTime, src_ip String, header_order_hash SimpleAggregateFunction(any, String), + header_count SimpleAggregateFunction(max, UInt16), + has_accept_language SimpleAggregateFunction(max, UInt8), + has_cookie SimpleAggregateFunction(max, UInt8), + has_referer SimpleAggregateFunction(max, UInt8), -- NEW (JA4H_a) modern_browser_score SimpleAggregateFunction(max, UInt8), sec_fetch_mode SimpleAggregateFunction(any, String), sec_fetch_dest SimpleAggregateFunction(any, String), @@ -87,22 +91,24 @@ CREATE TABLE mabase_prod.agg_header_fingerprint_1h ( ORDER BY (window_start, src_ip) TTL window_start + INTERVAL 7 DAY; -CREATE MATERIALIZED VIEW mabase_prod.mv_agg_header_fingerprint_1h -TO mabase_prod.agg_header_fingerprint_1h AS +CREATE MATERIALIZED VIEW mabase_prod.mv_agg_header_fingerprint_1h TO mabase_prod.agg_header_fingerprint_1h AS SELECT - toStartOfHour(time) AS window_start, - src_ip, - any(toString(cityHash64(client_headers))) AS header_order_hash, - max(toUInt8(if(length(header_sec_ch_ua) > 0, 100, if(length(header_user_agent) > 0, 50, 0)))) AS modern_browser_score, - any(header_sec_fetch_mode) AS sec_fetch_mode, - any(header_sec_fetch_dest) AS sec_fetch_dest, - sum(IF(header_sec_fetch_site = 'none', 1, 0)) AS count_site_none -FROM mabase_prod.http_logs -GROUP BY window_start, src_ip; - + toStartOfHour(src.time) AS window_start, + src.src_ip, + any(toString(cityHash64(src.client_headers))) AS header_order_hash, + max(toUInt16(if(empty(src.client_headers), 0, length(src.client_headers) - length(replaceAll(src.client_headers, ',', '')) + 1))) AS header_count, + max(toUInt8(if(positionCaseInsensitive(src.client_headers, 'Accept-Language') > 0, 1, 0))) AS has_accept_language, + max(toUInt8(if(positionCaseInsensitive(src.client_headers, 'Cookie') > 0, 1, 0))) AS has_cookie, + max(toUInt8(if(positionCaseInsensitive(src.client_headers, 'Referer') > 0, 1, 0))) AS has_referer, -- NEW (JA4H_a) + 
max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 100, if(length(src.header_user_agent) > 0, 50, 0)))) AS modern_browser_score, + any(src.header_sec_fetch_mode) AS sec_fetch_mode, + any(src.header_sec_fetch_dest) AS sec_fetch_dest, + sum(IF(src.header_sec_fetch_site = 'none', 1, 0)) AS count_site_none +FROM mabase_prod.http_logs AS src +GROUP BY window_start, src.src_ip; -- ---------------------------------------------------------------------------- --- 3. TABLE DE DESTINATION POUR LE MACHINE LEARNING +-- 2. ANOMALIES TABLE, WIDENED FOR GRAFANA -- ---------------------------------------------------------------------------- CREATE TABLE mabase_prod.ml_detected_anomalies ( detected_at DateTime, @@ -110,52 +116,78 @@ CREATE TABLE mabase_prod.ml_detected_anomalies ( ja4 String, host String, anomaly_score Float32, + + -- Application and behavioral dimensions + hits UInt64, + hit_velocity Float32, + fuzzing_index Float32, + post_ratio Float32, + site_none_ratio Float32, + + -- Network / TCP dimensions + port_exhaustion_ratio Float32, + max_keepalives UInt32, + orphan_ratio Float32, + tcp_jitter_variance Float32, + tcp_shared_count UInt32, + true_window_size UInt64, + window_mss_ratio Float32, + + -- TLS / contextual dimensions + alpn_http_mismatch UInt8, + is_alpn_missing UInt8, + sni_host_mismatch UInt8, + + -- JA4H (headers) dimensions + header_count UInt16, + has_accept_language UInt8, + has_cookie UInt8, + has_referer UInt8, + modern_browser_score UInt8, + is_headless UInt8, + header_order_shared_count UInt32, -- NEW (JA4H_b) + reason String ) ENGINE = MergeTree() ORDER BY (detected_at, src_ip, ja4) TTL detected_at + INTERVAL 30 DAY; - -- ---------------------------------------------------------------------------- --- 4. VUE DE FEATURE ENGINEERING POUR L'ISOLATION FOREST (RÉSOLUE) +-- 3. 
FEATURE-ENGINEERING VIEW FOR THE AI MODEL (OVER 24H) -- ---------------------------------------------------------------------------- --- Utilisation de sous-requêtes agrégées (GROUP BY explicite) avant la jointure --- pour éviter les erreurs d'état et le produit cartésien. -CREATE VIEW mabase_prod.view_ai_features_1h AS +CREATE OR REPLACE VIEW mabase_prod.view_ai_features_1h AS SELECT - a.src_ip, - a.ja4, - a.host, - a.hits, - a.uniq_paths, - a.uniq_query_params, - a.count_post, - - -- Indicateur de Corrélation L4/L7 - IF(length(a.ja4) > 0 AND length(a.tcp_fingerprint) > 0, 1, 0) AS correlated, + a.src_ip, a.ja4, a.host, a.hits, a.uniq_paths, a.uniq_query_params, a.count_post, + a.correlated AS correlated, - -- DIMENSIONS COMPORTEMENTALES (a.count_post / (a.hits + 1)) AS post_ratio, (a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index, (a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity, + (a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio, + (a.orphan_count / (a.hits + 1)) AS orphan_ratio, + a.max_keepalives AS max_keepalives, - -- DIMENSIONS TCP / L4 COALESCE(a.tcp_jitter_variance, 0) AS tcp_jitter_variance, count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count, a.tcp_window_size * exp2(a.tcp_window_scale) AS true_window_size, IF(a.tcp_mss > 0, a.tcp_window_size / a.tcp_mss, 0) AS window_mss_ratio, - -- DIMENSIONS TLS / L5 (Mismatch) - IF(substring(a.ja4, 10, 2) = 'h2' AND a.http_version!= '2', 1, 0) AS alpn_http_mismatch, - IF(substring(a.ja4, 10, 2) = '00', 1, 0) AS is_alpn_missing, + IF(a.tls_alpn = 'h2' AND a.http_version!= '2', 1, 0) AS alpn_http_mismatch, + IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing, + IF(length(a.tls_sni) > 0 AND a.tls_sni!= a.host, 1, 0) AS sni_host_mismatch, - -- DIMENSIONS HTTP / L7 + COALESCE(h.header_count, 0) AS header_count, + COALESCE(h.has_accept_language, 0) AS has_accept_language, + COALESCE(h.has_cookie, 0) AS has_cookie, + COALESCE(h.has_referer, 
0) AS has_referer, COALESCE(h.modern_browser_score, 0) AS modern_browser_score, IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest!= 'document', 1, 0) AS is_fake_navigation, - (h.count_site_none / (a.hits + 1)) AS site_none_ratio + (h.count_site_none / (a.hits + 1)) AS site_none_ratio, + + -- JA4H_b: group by header order to detect botnets + count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count FROM ( - -- Consolidation des logs d'hôtes (Résolution du GROUP BY manquant) SELECT window_start, src_ip, ja4, host, sum(hits) AS hits, @@ -169,83 +201,31 @@ FROM ( any(tcp_window_size) AS tcp_window_size, any(tcp_window_scale) AS tcp_window_scale, any(tcp_mss) AS tcp_mss, - any(http_version) AS http_version + any(http_version) AS http_version, + any(tls_alpn) AS tls_alpn, + any(tls_sni) AS tls_sni, + max(correlated) AS correlated, + uniqMerge(unique_src_ports) AS unique_src_ports, + max(max_keepalives) AS max_keepalives, + sum(orphan_count) AS orphan_count FROM mabase_prod.agg_host_ip_ja4_1h - WHERE window_start >= toStartOfHour(now() - INTERVAL 2 HOUR) + WHERE window_start >= toStartOfHour(now() - INTERVAL 24 HOUR) GROUP BY window_start, src_ip, ja4, host ) a LEFT JOIN ( - -- Consolidation des en-têtes SELECT window_start, src_ip, + any(header_order_hash) AS header_order_hash, + max(header_count) AS header_count, + max(has_accept_language) AS has_accept_language, + max(has_cookie) AS has_cookie, + max(has_referer) AS has_referer, max(modern_browser_score) AS modern_browser_score, any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest, sum(count_site_none) AS count_site_none FROM mabase_prod.agg_header_fingerprint_1h - WHERE window_start >= toStartOfHour(now() - INTERVAL 2 HOUR) + WHERE window_start >= toStartOfHour(now() - INTERVAL 24 HOUR) GROUP BY window_start, src_ip ) h -ON a.src_ip = h.src_ip AND a.window_start = h.window_start; - - --- 
---------------------------------------------------------------------------- --- 5. VUES DE DÉTECTION HEURISTIQUES STATIQUES (RÉSOLUES) --- ---------------------------------------------------------------------------- - -CREATE VIEW mabase_prod.view_host_ip_ja4_rotation AS -SELECT - src_ip, - uniqExact(ja4) AS distinct_ja4_count, - sum(hits) AS total_hits -FROM mabase_prod.agg_host_ip_ja4_1h -WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR) -GROUP BY src_ip -HAVING distinct_ja4_count >= 5 AND total_hits > 100; - -CREATE VIEW mabase_prod.view_host_ja4_anomalies AS -SELECT - ja4, - uniqExact(src_ip) AS unique_ips, - uniqExact(src_country_code) AS unique_countries, - uniqExact(host) AS targeted_hosts -FROM mabase_prod.agg_host_ip_ja4_1h -WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR) -GROUP BY ja4 -HAVING unique_ips >= 20 AND targeted_hosts >= 3; - --- Ajout du GROUP BY -CREATE VIEW mabase_prod.view_form_bruteforce_detected AS -SELECT - src_ip, ja4, host, - sum(hits) AS hits, - uniqMerge(uniq_query_params) AS query_params_count -FROM mabase_prod.agg_host_ip_ja4_1h -WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR) -GROUP BY src_ip, ja4, host -HAVING query_params_count >= 10 AND hits >= 20; - --- Ajout du GROUP BY -CREATE VIEW mabase_prod.view_alpn_mismatch_detected AS -SELECT - src_ip, ja4, host, - sum(hits) AS hits, - any(http_version) AS http_version -FROM mabase_prod.agg_host_ip_ja4_1h -WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR) - AND substring(ja4, 10, 2) IN ('h2', 'h3') -GROUP BY src_ip, ja4, host -HAVING http_version = '1.1' AND hits >= 10; - --- Ajout du GROUP BY -CREATE VIEW mabase_prod.view_tcp_spoofing_detected AS -SELECT - src_ip, ja4, - any(tcp_ttl) AS tcp_ttl, - any(tcp_window_size) AS tcp_window_size, - any(first_ua) AS first_ua -FROM mabase_prod.agg_host_ip_ja4_1h -WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR) -GROUP BY src_ip, ja4 -HAVING tcp_ttl <= 64 - AND (first_ua ILIKE 
'%Windows%' OR first_ua ILIKE '%iPhone%'); +ON a.src_ip = h.src_ip AND a.window_start = h.window_start;