From 14db3d9040be3ee1fe41d2019914e0f75eace381 Mon Sep 17 00:00:00 2001 From: toto Date: Thu, 9 Apr 2026 23:06:01 +0200 Subject: [PATCH] =?UTF-8?q?refactor:=20suppression=20d=C3=A9pendance=20Use?= =?UTF-8?q?r-Agent=20de=20la=20d=C3=A9tection=20navigateur?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changements SQL : - modern_browser_score : sec-ch-ua→100, Sec-Fetch→70 (plus de UA fallback) - Ajout has_sec_ch_ua (UInt8) dans agg_header_fingerprint_1h et ml_all_scores - mss_mobile_mismatch utilise has_sec_ch_ua au lieu de modern_browser_score - header_order_confidence : PARTITION BY ja4 au lieu de first_ua - sec_ch_mobile_mismatch : comparaison Client Hints interne (sans UA) - Migration 03_remove_ua_browser_detection.sql Changements Python : - browser.py Axe 3 : Client Hints + Sec-Fetch + is_fake_navigation (PAS de UA) - Pondération axes : ja4_known 0.30, tls_coherence 0.20 (signaux TLS renforcés) - preprocessing.py : has_sec_ch_ua ajouté aux features et binary_features Fichiers modifiés : 8 SQL/Python + 1 migration, 36/36 tests passent. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../anubis/view_ai_features_anubis.sql | 8 ++-- services/bot-detector/bot_detector/browser.py | 28 +++++++----- .../bot_detector/preprocessing.py | 5 ++- .../bot_detector/tests/test_detector.py | 8 ++-- services/bot-detector/deploy_views.sql | 26 ++++++----- .../03_remove_ua_browser_detection.sql | 45 +++++++++++++++++++ shared/clickhouse/05_aggregation_tables.sql | 8 ++-- shared/clickhouse/06_ml_tables.sql | 3 +- shared/clickhouse/07_ai_features_view.sql | 8 ++-- 9 files changed, 101 insertions(+), 38 deletions(-) create mode 100644 services/correlator/sql/migrations/03_remove_ua_browser_detection.sql diff --git a/services/bot-detector/anubis/view_ai_features_anubis.sql b/services/bot-detector/anubis/view_ai_features_anubis.sql index 11bcdc4..ce1a920 100644 --- a/services/bot-detector/anubis/view_ai_features_anubis.sql +++ b/services/bot-detector/anubis/view_ai_features_anubis.sql @@ -38,6 +38,7 @@ WITH base_data AS ( h.header_order_hash AS header_order_hash, h.header_count AS header_count, h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie, h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, + h.has_sec_ch_ua AS has_sec_ch_ua, h.ua_ch_mismatch AS ua_ch_mismatch, (a.count_post / (a.hits + 1)) AS post_ratio, (a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index, @@ -46,7 +47,7 @@ WITH base_data AS ( (a.orphan_count / (a.hits + 1)) AS orphan_ratio, (a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio, (a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency, - IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch, + IF(a.mss_1460_count > (a.hits * 0.8) AND h.has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch, a.request_size_variance AS request_size_variance, IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch, IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS 
is_alpn_missing, @@ -62,7 +63,7 @@ WITH base_data AS ( (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration, (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration, IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4, - (count() OVER (PARTITION BY h.header_order_hash, a.first_ua) / (count() OVER (PARTITION BY a.first_ua) + 1)) AS header_order_confidence, + (count() OVER (PARTITION BY h.header_order_hash, a.ja4) / (count() OVER (PARTITION BY a.ja4) + 1)) AS header_order_confidence, uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders, (a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio, a.url_depth_variance AS url_depth_variance, @@ -127,7 +128,8 @@ WITH base_data AS ( window_start, src_ip, any(header_order_hash) AS header_order_hash, max(header_count) AS header_count, max(has_accept_language) AS has_accept_language, max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, - max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch, + max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS has_sec_ch_ua, + max(ua_ch_mismatch) AS ua_ch_mismatch, any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest FROM ja4_processing.agg_header_fingerprint_1h WHERE window_start >= now() - INTERVAL 24 HOUR diff --git a/services/bot-detector/bot_detector/browser.py b/services/bot-detector/bot_detector/browser.py index fcece45..e5f134b 100644 --- a/services/bot-detector/bot_detector/browser.py +++ b/services/bot-detector/bot_detector/browser.py @@ -1,8 +1,8 @@ """A9 — Identification multifactorielle des navigateurs. 5 axes indépendants combinés pour identifier les navigateurs légitimes. -Chaque axe produit un score [0,1]. 
La combinaison pondérée produit -browser_confidence [0,1], remplaçant l'ancien système binaire JA4-only. +Aucun axe ne repose sur le User-Agent (trivialement falsifiable). +Signaux principaux : JA4 (TLS), Client Hints, Sec-Fetch-*, comportement. """ import pandas as pd import numpy as np @@ -22,10 +22,11 @@ _BROWSER_JA4_PROFILES = { } # Pondération des 5 axes pour le score browser_confidence. +# Favorise les signaux TLS (difficiles à falsifier) sur HTTP. _AXIS_WEIGHTS = { - 'ja4_known': 0.25, # Axe 1 — Signature JA4 dans dict_browser_ja4 + 'ja4_known': 0.30, # Axe 1 — Signature JA4 dans dict_browser_ja4 (TLS fingerprint) 'ja4_struct': 0.15, # Axe 2 — Structure JA4 (TLS1.3, h2, nb ciphers/ext) - 'http_modern': 0.25, # Axe 3 — Headers HTTP modernes (sec-ch-ua, Sec-Fetch-*) + 'http_modern': 0.20, # Axe 3 — Client Hints + Sec-Fetch-* (PAS de User-Agent) 'nav_behavior': 0.15, # Axe 4 — Comportement de navigation (assets, referers) 'tls_coherence': 0.20, # Axe 5 — Cohérence TLS/TCP (pas de mismatch) } @@ -80,18 +81,21 @@ def _compute_browser_axes(df: pd.DataFrame) -> pd.DataFrame: is_tls13 * 0.35 + is_h2h3 * 0.25 + ciphers_ok * 0.20 + exts_ok * 0.20 ) - # ── Axe 3 — Headers HTTP modernes ── - mbs = df.get('modern_browser_score', pd.Series(0, index=df.index)).fillna(0) + # ── Axe 3 — Signaux HTTP sans User-Agent ── + # Client Hints (sec-ch-ua) = signal fort, nécessite un vrai moteur Chromium. + # Sec-Fetch-* = signal moyen, envoyé automatiquement par les navigateurs modernes. + # Le User-Agent n'est PAS utilisé (trivialement falsifiable par n'importe quel bot). 
+ scu = df.get('has_sec_ch_ua', pd.Series(0, index=df.index)).fillna(0) hal = df.get('has_accept_language', pd.Series(0, index=df.index)).fillna(0) sfa = df.get('sec_fetch_absence_rate', pd.Series(1, index=df.index)).fillna(1) gar = df.get('generic_accept_ratio', pd.Series(1, index=df.index)).fillna(1) - uam = df.get('ua_ch_mismatch', pd.Series(0, index=df.index)).fillna(0) + ifn = df.get('is_fake_navigation', pd.Series(0, index=df.index)).fillna(0) axes['axis_http_modern'] = ( - (mbs >= 50).astype(float) * 0.35 - + (hal > 0).astype(float) * 0.20 - + (sfa < 0.3).astype(float) * 0.25 - + (gar < 0.3).astype(float) * 0.10 - + (uam == 0).astype(float) * 0.10 + (scu > 0).astype(float) * 0.30 + + (sfa < 0.2).astype(float) * 0.30 + + (hal > 0).astype(float) * 0.15 + + (gar < 0.3).astype(float) * 0.15 + + (ifn == 0).astype(float) * 0.10 ) # ── Axe 4 — Comportement de navigation ── diff --git a/services/bot-detector/bot_detector/preprocessing.py b/services/bot-detector/bot_detector/preprocessing.py index d55ddc7..a1018f7 100644 --- a/services/bot-detector/bot_detector/preprocessing.py +++ b/services/bot-detector/bot_detector/preprocessing.py @@ -20,7 +20,7 @@ FEATURES = [ 'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'orphan_ratio', 'max_keepalives', 'tcp_shared_count', 'header_order_shared_count', 'header_count', 'has_accept_language', 'has_cookie', 'has_referer', - 'modern_browser_score', 'ua_ch_mismatch', 'ip_id_zero_ratio', + 'modern_browser_score', 'ua_ch_mismatch', 'has_sec_ch_ua', 'ip_id_zero_ratio', 'request_size_variance', 'multiplexing_efficiency', 'mss_mobile_mismatch', 'asset_ratio', 'direct_access_ratio', 'is_ua_rotating', 'distinct_ja4_count', 'src_port_density', 'ja4_asn_concentration', 'ja4_country_concentration', 'is_rare_ja4', @@ -103,7 +103,8 @@ def preprocess_df(df: pd.DataFrame) -> pd.DataFrame: # Imputation intelligente binary_features = { - 'has_accept_language', 'has_cookie', 'has_referer', 'ua_ch_mismatch', + 
'has_accept_language', 'has_cookie', 'has_referer', 'has_sec_ch_ua', + 'ua_ch_mismatch', 'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch', 'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4', 'is_fake_navigation', 'has_xff', 'sec_ch_mobile_mismatch', diff --git a/services/bot-detector/bot_detector/tests/test_detector.py b/services/bot-detector/bot_detector/tests/test_detector.py index 12d4e60..9204ffa 100644 --- a/services/bot-detector/bot_detector/tests/test_detector.py +++ b/services/bot-detector/bot_detector/tests/test_detector.py @@ -606,23 +606,23 @@ def test_browser_consistency_score_range(): """browser_consistency_score is in [0, 5] and sums 5 binary signals.""" df = pd.DataFrame({ 'browser_family': ['Chromium', '', 'Firefox', 'Safari'], - 'modern_browser_score': [100, 0, 80, 50], + 'has_sec_ch_ua': [1, 0, 1, 0], 'has_accept_language': [1, 0, 1, 1], 'has_cookie': [1, 0, 1, 0], 'sec_fetch_absence_rate': [0.0, 1.0, 0.1, 0.6], }) is_known = (df['browser_family'] != '').astype(int) - mbs_ok = (df['modern_browser_score'] >= 50).astype(int) + scu_ok = (df['has_sec_ch_ua'] > 0).astype(int) hal_ok = (df['has_accept_language'] > 0).astype(int) hck_ok = (df['has_cookie'] > 0).astype(int) sfa_ok = (df['sec_fetch_absence_rate'] < 0.5).astype(int) - bcs = is_known + mbs_ok + hal_ok + hck_ok + sfa_ok + bcs = is_known + scu_ok + hal_ok + hck_ok + sfa_ok assert bcs.min() >= 0 and bcs.max() <= 5 assert bcs.iloc[0] == 5, "Chromium with all signals should score 5" assert bcs.iloc[1] == 0, "Empty browser with no signals should score 0" assert bcs.iloc[2] == 5, "Firefox with all signals should score 5" - assert bcs.iloc[3] == 3, "Safari without cookie and high sec_fetch_absence should score 3" + assert bcs.iloc[3] == 2, "Safari without CH/cookie and high sec_fetch_absence should score 2" def test_legitimate_browser_classification_threshold(): diff --git a/services/bot-detector/deploy_views.sql b/services/bot-detector/deploy_views.sql index 
30452e0..29d6811 100644 --- a/services/bot-detector/deploy_views.sql +++ b/services/bot-detector/deploy_views.sql @@ -178,6 +178,7 @@ CREATE TABLE ja4_processing.agg_header_fingerprint_1h has_cookie SimpleAggregateFunction(max, UInt8), has_referer SimpleAggregateFunction(max, UInt8), modern_browser_score SimpleAggregateFunction(max, UInt8), + has_sec_ch_ua SimpleAggregateFunction(max, UInt8), ua_ch_mismatch SimpleAggregateFunction(max, UInt8), sec_fetch_mode SimpleAggregateFunction(any, String), sec_fetch_dest SimpleAggregateFunction(any, String) @@ -200,7 +201,8 @@ SELECT max(toUInt8(if(position(src.client_headers, 'Accept-Language') > 0, 1, 0))) AS has_accept_language, max(toUInt8(if(position(src.client_headers, 'Cookie') > 0, 1, 0))) AS has_cookie, max(toUInt8(if(position(src.client_headers, 'Referer') > 0, 1, 0))) AS has_referer, - max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 100, if(length(src.header_user_agent) > 0, 50, 0)))) AS modern_browser_score, + max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 100, if(length(src.header_sec_fetch_site) > 0, 70, 0)))) AS modern_browser_score, + max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 1, 0))) AS has_sec_ch_ua, max(toUInt8(if((position(src.header_user_agent, 'Windows') > 0 AND position(src.header_sec_ch_ua_platform, 'Windows') == 0) OR (position(src.header_user_agent, 'iPhone') > 0 AND position(src.header_sec_ch_ua_platform, 'iOS') == 0), 1, 0))) AS ua_ch_mismatch, any(src.header_sec_fetch_mode) AS sec_fetch_mode, any(src.header_sec_fetch_dest) AS sec_fetch_dest @@ -219,7 +221,7 @@ CREATE TABLE ja4_processing.ml_detected_anomalies max_keepalives UInt32, orphan_ratio Float32, tcp_jitter_variance Float32, tcp_shared_count UInt32, true_window_size UInt64, window_mss_ratio Float32, alpn_http_mismatch UInt8, is_alpn_missing UInt8, sni_host_mismatch UInt8, header_count UInt16, has_accept_language UInt8, has_cookie UInt8, has_referer UInt8, modern_browser_score UInt8, - is_headless UInt8, ua_ch_mismatch UInt8, 
header_order_shared_count UInt32, ip_id_zero_ratio Float32, + has_sec_ch_ua UInt8, is_headless UInt8, ua_ch_mismatch UInt8, header_order_shared_count UInt32, ip_id_zero_ratio Float32, request_size_variance Float32, multiplexing_efficiency Float32, mss_mobile_mismatch UInt8, correlated UInt8, reason String, asset_ratio Float32, direct_access_ratio Float32, is_ua_rotating UInt8, distinct_ja4_count UInt32, src_port_density Float32, ja4_asn_concentration Float32, ja4_country_concentration Float32, is_rare_ja4 UInt8, @@ -303,12 +305,13 @@ WITH base_data AS ( a.true_window_size AS true_window_size, a.window_mss_ratio AS window_mss_ratio, a.max_keepalives AS max_keepalives, h.header_order_hash AS header_order_hash, h.header_count AS header_count, h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie, - h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, h.ua_ch_mismatch AS ua_ch_mismatch, + h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, + h.has_sec_ch_ua AS has_sec_ch_ua, h.ua_ch_mismatch AS ua_ch_mismatch, (a.count_post / (a.hits + 1)) AS post_ratio, (a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index, (a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity, (a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio, (a.orphan_count / (a.hits + 1)) AS orphan_ratio, (a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio, (a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency, - IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch, + IF(a.mss_1460_count > (a.hits * 0.8) AND h.has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch, a.request_size_variance AS request_size_variance, IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch, IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing, @@ -322,7 +325,7 @@ WITH base_data AS ( (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / 
(sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration, (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration, IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4, - (count() OVER (PARTITION BY h.header_order_hash, a.first_ua) / (count() OVER (PARTITION BY a.first_ua) + 1)) AS header_order_confidence, + (count() OVER (PARTITION BY h.header_order_hash, a.ja4) / (count() OVER (PARTITION BY a.ja4) + 1)) AS header_order_confidence, uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders, (a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio, a.url_depth_variance AS url_depth_variance, @@ -375,7 +378,8 @@ WITH base_data AS ( window_start, src_ip, any(header_order_hash) AS header_order_hash, max(header_count) AS header_count, max(has_accept_language) AS has_accept_language, max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, - max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch, + max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS has_sec_ch_ua, + max(ua_ch_mismatch) AS ua_ch_mismatch, any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest FROM ja4_processing.agg_header_fingerprint_1h WHERE window_start >= now() - INTERVAL 24 HOUR @@ -450,12 +454,13 @@ WITH base_data AS ( a.true_window_size AS true_window_size, a.window_mss_ratio AS window_mss_ratio, a.max_keepalives AS max_keepalives, h.header_order_hash AS header_order_hash, h.header_count AS header_count, h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie, - h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, h.ua_ch_mismatch AS ua_ch_mismatch, + h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, + h.has_sec_ch_ua AS has_sec_ch_ua, h.ua_ch_mismatch AS ua_ch_mismatch, (a.count_post / (a.hits + 1)) AS post_ratio, 
(a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index, (a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity, (a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio, (a.orphan_count / (a.hits + 1)) AS orphan_ratio, (a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio, (a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency, - IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch, + IF(a.mss_1460_count > (a.hits * 0.8) AND h.has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch, a.request_size_variance AS request_size_variance, IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch, IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing, @@ -469,7 +474,7 @@ WITH base_data AS ( (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration, (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration, IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4, - (count() OVER (PARTITION BY h.header_order_hash, a.first_ua) / (count() OVER (PARTITION BY a.first_ua) + 1)) AS header_order_confidence, + (count() OVER (PARTITION BY h.header_order_hash, a.ja4) / (count() OVER (PARTITION BY a.ja4) + 1)) AS header_order_confidence, uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders, (a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio, a.url_depth_variance AS url_depth_variance, @@ -519,7 +524,8 @@ WITH base_data AS ( src_ip, any(header_order_hash) AS header_order_hash, max(header_count) AS header_count, max(has_accept_language) AS has_accept_language, max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, - max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch, + max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS 
has_sec_ch_ua, + max(ua_ch_mismatch) AS ua_ch_mismatch, any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest FROM ja4_processing.agg_header_fingerprint_1h WHERE window_start >= now() - INTERVAL 24 HOUR diff --git a/services/correlator/sql/migrations/03_remove_ua_browser_detection.sql b/services/correlator/sql/migrations/03_remove_ua_browser_detection.sql new file mode 100644 index 0000000..80bcc8b --- /dev/null +++ b/services/correlator/sql/migrations/03_remove_ua_browser_detection.sql @@ -0,0 +1,45 @@ +-- === 03_remove_ua_browser_detection.sql — Suppression dépendance User-Agent === +-- +-- Contexte : l'identification navigateur ne doit PAS se baser sur le User-Agent +-- (trivialement falsifiable). Cette migration : +-- 1. Ajoute has_sec_ch_ua à agg_header_fingerprint_1h +-- 2. Recrée la MV avec modern_browser_score basé sur Client Hints + Sec-Fetch +-- 3. Recrée sec_ch_mobile_mismatch sans UA (Client Hints only) +-- 4. Ajoute has_sec_ch_ua à ml_all_scores +-- --------------------------------------------------------------------------- + +-- 1. Nouvelle colonne has_sec_ch_ua +ALTER TABLE ja4_processing.agg_header_fingerprint_1h + ADD COLUMN IF NOT EXISTS has_sec_ch_ua SimpleAggregateFunction(max, UInt8) + AFTER modern_browser_score; + +-- 2. 
Recréer la MV sans dépendance UA +DROP VIEW IF EXISTS ja4_processing.mv_agg_header_fingerprint_1h; + +CREATE MATERIALIZED VIEW ja4_processing.mv_agg_header_fingerprint_1h +TO ja4_processing.agg_header_fingerprint_1h AS +SELECT + toStartOfHour(src.time) AS window_start, + toIPv6(src.src_ip) AS src_ip, + any(toString(cityHash64(src.client_headers))) AS header_order_hash, + max(toUInt16(length(src.client_headers) - length(replaceAll(src.client_headers, ',', '')) + 1)) AS header_count, + max(toUInt8(if(position(src.client_headers, 'Accept-Language') > 0, 1, 0))) AS has_accept_language, + max(toUInt8(if(position(src.client_headers, 'Cookie') > 0, 1, 0))) AS has_cookie, + max(toUInt8(if(position(src.client_headers, 'Referer') > 0, 1, 0))) AS has_referer, + -- modern_browser_score : sec-ch-ua → 100, Sec-Fetch → 70, sinon → 0 (PAS de UA) + max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 100, if(length(src.header_sec_fetch_site) > 0, 70, 0)))) AS modern_browser_score, + max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 1, 0))) AS has_sec_ch_ua, + max(toUInt8(if((position(src.header_user_agent, 'Windows') > 0 AND position(src.header_sec_ch_ua_platform, 'Windows') == 0) OR (position(src.header_user_agent, 'iPhone') > 0 AND position(src.header_sec_ch_ua_platform, 'iOS') == 0), 1, 0))) AS ua_ch_mismatch, + -- sec_ch_mobile_mismatch : incohérence interne Client Hints (pas de UA) + max(toUInt8(if( + (src.header_sec_ch_ua_mobile = '?1' AND position(src.header_sec_ch_ua_platform, 'Windows') > 0) + OR (src.header_sec_ch_ua_mobile = '?0' AND position(src.header_sec_ch_ua_platform, 'Android') > 0), + 1, 0))) AS sec_ch_mobile_mismatch, + any(src.header_sec_fetch_mode) AS sec_fetch_mode, + any(src.header_sec_fetch_dest) AS sec_fetch_dest +FROM ja4_logs.http_logs AS src +GROUP BY window_start, src.src_ip; + +-- 3. 
Ajouter has_sec_ch_ua à ml_all_scores +ALTER TABLE ja4_processing.ml_all_scores + ADD COLUMN IF NOT EXISTS has_sec_ch_ua UInt8 DEFAULT 0; diff --git a/shared/clickhouse/05_aggregation_tables.sql b/shared/clickhouse/05_aggregation_tables.sql index d17a1f7..32a40ce 100644 --- a/shared/clickhouse/05_aggregation_tables.sql +++ b/shared/clickhouse/05_aggregation_tables.sql @@ -202,6 +202,7 @@ CREATE TABLE IF NOT EXISTS ja4_processing.agg_header_fingerprint_1h has_cookie SimpleAggregateFunction(max, UInt8), has_referer SimpleAggregateFunction(max, UInt8), modern_browser_score SimpleAggregateFunction(max, UInt8), + has_sec_ch_ua SimpleAggregateFunction(max, UInt8), ua_ch_mismatch SimpleAggregateFunction(max, UInt8), sec_ch_mobile_mismatch SimpleAggregateFunction(max, UInt8), sec_fetch_mode SimpleAggregateFunction(any, String), @@ -222,11 +223,12 @@ SELECT max(toUInt8(if(position(src.client_headers, 'Accept-Language') > 0, 1, 0))) AS has_accept_language, max(toUInt8(if(position(src.client_headers, 'Cookie') > 0, 1, 0))) AS has_cookie, max(toUInt8(if(position(src.client_headers, 'Referer') > 0, 1, 0))) AS has_referer, - max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 100, if(length(src.header_user_agent) > 0, 50, 0)))) AS modern_browser_score, + max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 100, if(length(src.header_sec_fetch_site) > 0, 70, 0)))) AS modern_browser_score, + max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 1, 0))) AS has_sec_ch_ua, max(toUInt8(if((position(src.header_user_agent, 'Windows') > 0 AND position(src.header_sec_ch_ua_platform, 'Windows') == 0) OR (position(src.header_user_agent, 'iPhone') > 0 AND position(src.header_sec_ch_ua_platform, 'iOS') == 0), 1, 0))) AS ua_ch_mismatch, max(toUInt8(if( - (src.header_sec_ch_ua_mobile = '?1' AND position(src.header_user_agent, 'Mobile') == 0 AND position(src.header_user_agent, 'Android') == 0 AND position(src.header_user_agent, 'iPhone') == 0) - OR (src.header_sec_ch_ua_mobile = '?0' AND 
(position(src.header_user_agent, 'iPhone') > 0 OR position(src.header_user_agent, 'Android') > 0)), + (src.header_sec_ch_ua_mobile = '?1' AND position(src.header_sec_ch_ua_platform, 'Windows') > 0) + OR (src.header_sec_ch_ua_mobile = '?0' AND position(src.header_sec_ch_ua_platform, 'Android') > 0), 1, 0))) AS sec_ch_mobile_mismatch, any(src.header_sec_fetch_mode) AS sec_fetch_mode, any(src.header_sec_fetch_dest) AS sec_fetch_dest diff --git a/shared/clickhouse/06_ml_tables.sql b/shared/clickhouse/06_ml_tables.sql index a7ce601..f07026d 100644 --- a/shared/clickhouse/06_ml_tables.sql +++ b/shared/clickhouse/06_ml_tables.sql @@ -33,7 +33,8 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_detected_anomalies tcp_jitter_variance Float32, tcp_shared_count UInt32, true_window_size UInt64, window_mss_ratio Float32, alpn_http_mismatch UInt8, is_alpn_missing UInt8, sni_host_mismatch UInt8, header_count UInt16, has_accept_language UInt8, - has_cookie UInt8, has_referer UInt8, modern_browser_score UInt8, is_headless UInt8, + has_cookie UInt8, has_referer UInt8, modern_browser_score UInt8, has_sec_ch_ua UInt8, + is_headless UInt8, ua_ch_mismatch UInt8, header_order_shared_count UInt32, ip_id_zero_ratio Float32, request_size_variance Float32, multiplexing_efficiency Float32, mss_mobile_mismatch UInt8, correlated UInt8, reason String, diff --git a/shared/clickhouse/07_ai_features_view.sql b/shared/clickhouse/07_ai_features_view.sql index 8b9aebf..50792ba 100644 --- a/shared/clickhouse/07_ai_features_view.sql +++ b/shared/clickhouse/07_ai_features_view.sql @@ -44,6 +44,7 @@ WITH base_data AS ( h.header_order_hash AS header_order_hash, h.header_count AS header_count, h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie, h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, + h.has_sec_ch_ua AS has_sec_ch_ua, h.ua_ch_mismatch AS ua_ch_mismatch, (a.count_post / (a.hits + 1)) AS post_ratio, (a.uniq_query_params / (a.uniq_paths + 1)) AS 
fuzzing_index, @@ -52,7 +53,7 @@ WITH base_data AS ( (a.orphan_count / (a.hits + 1)) AS orphan_ratio, (a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio, (a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency, - IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch, + IF(a.mss_1460_count > (a.hits * 0.8) AND h.has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch, a.request_size_variance AS request_size_variance, IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch, IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing, @@ -68,7 +69,7 @@ WITH base_data AS ( (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration, (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration, IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4, - (count() OVER (PARTITION BY h.header_order_hash, a.first_ua) / (count() OVER (PARTITION BY a.first_ua) + 1)) AS header_order_confidence, + (count() OVER (PARTITION BY h.header_order_hash, a.ja4) / (count() OVER (PARTITION BY a.ja4) + 1)) AS header_order_confidence, uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders, (a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio, a.url_depth_variance AS url_depth_variance, @@ -141,7 +142,8 @@ WITH base_data AS ( window_start, src_ip, any(header_order_hash) AS header_order_hash, max(header_count) AS header_count, max(has_accept_language) AS has_accept_language, max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, - max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch, + max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS has_sec_ch_ua, + max(ua_ch_mismatch) AS ua_ch_mismatch, max(sec_ch_mobile_mismatch) AS sec_ch_mobile_mismatch, any(sec_fetch_mode) AS 
sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest FROM ja4_processing.agg_header_fingerprint_1h