diff --git a/services/bot-detector/bot_detector/browser_matcher.py b/services/bot-detector/bot_detector/browser_matcher.py new file mode 100644 index 0000000..0ba7946 --- /dev/null +++ b/services/bot-detector/bot_detector/browser_matcher.py @@ -0,0 +1,383 @@ +"""Moteur de correspondance de signatures navigateur (browser_matcher). + +Remplace progressivement le système browser_confidence pour la décision de +bypass LEGITIMATE_BROWSER. Pendant la phase de validation, les deux systèmes +tournent en parallèle (BROWSER_MATCHER_REPLACE=false) : les décisions sont +journalisées côte à côte sans modifier le comportement existant. + +Fonctionne en mode vectorisé sur un DataFrame pandas entier. +Les colonnes requises sont fournies par view_ai_features_1h (07_ai_features_view.sql). + +Variables d'environnement : + BROWSER_MATCHER_ENABLED : activer le matcher (défaut: true) + BROWSER_MATCHER_REPLACE : remplacer browser_confidence pour le bypass (défaut: false) +""" +import os +import pandas as pd +import numpy as np + +from .browser_signatures import ( + BROWSER_SIGNATURES, + NON_BROWSER_SIGNATURES, + BROWSER_THRESHOLDS, + DIMENSION_WEIGHTS, +) +from .log import log_info + +BROWSER_MATCHER_ENABLED: bool = os.getenv("BROWSER_MATCHER_ENABLED", "true").lower() == "true" +BROWSER_MATCHER_REPLACE: bool = os.getenv("BROWSER_MATCHER_REPLACE", "false").lower() == "true" + +# Familles Chrome-like dans le dictionnaire dict_browser_ja4. 
+_CHROME_JA4_FAMILIES = {"Chromium", "Chrome", "Edge"}
+
+# Lowest adjusted score still reported as a partial ("grey zone") match.
+_PARTIAL_SCORE_FLOOR = 0.45
+
+
+def _col(df: pd.DataFrame, name: str, default=0) -> pd.Series:
+    """Return column ``name`` with NaN filled, or a constant series if absent.
+
+    Args:
+        df: Input frame.
+        name: Column to read.
+        default: Fill value for NaN cells; also the value of every row when
+            the column does not exist.
+
+    Returns:
+        A Series aligned on ``df.index``.
+    """
+    if name in df.columns:
+        return df[name].fillna(default)
+    # A float series cannot hold a text default: dtype=float raised ValueError
+    # for default="" whenever h2_dict_family / browser_family was missing.
+    dtype = object if isinstance(default, str) else float
+    return pd.Series(default, index=df.index, dtype=dtype)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Per-dimension scoring
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _d1_h2_settings(df: pd.DataFrame, family: str) -> pd.Series:
+    """Dimension 1 — H2 SETTINGS match (weight 0.30).
+
+    Reads ``h2_dict_family`` and ``h2_ja4_coherence``. The score is
+    ``0.8 * family_match + 0.2 * h2_ja4_coherence``, so an exact family match
+    alone yields 0.8. Rows with ``has_xff > 0`` get a neutral 0.5 because the
+    client H2 fingerprint is lost behind a CDN proxy.
+    """
+    has_xff = _col(df, "has_xff")
+    h2_dict_family = _col(df, "h2_dict_family", "")
+
+    # Exact family match against the H2 dictionary
+    match = (h2_dict_family.astype(str) == family).astype(float)
+
+    # JA4<->H2 coherence acts as a reinforcement signal
+    h2_ja4_coherence = _col(df, "h2_ja4_coherence")
+
+    base = match * 0.8 + h2_ja4_coherence * 0.2
+
+    return pd.Series(
+        np.where(has_xff > 0, 0.5, base.values),
+        index=df.index,
+    )
+
+
+def _d2_h2_window(df: pd.DataFrame, family: str) -> pd.Series:
+    """Dimension 2 — H2 WINDOW_UPDATE value (weight 0.15).
+
+    Score 1.0 when ``h2_window_update_value`` lies within the family's
+    tolerance of its expected value, 0.0 when it is 0 (absent) or belongs to
+    another family.
+    """
+    sig = BROWSER_SIGNATURES[family]
+    wu = _col(df, "h2_window_update_value")
+    tol = sig["h2_window_update_tolerance"]
+    expected = sig["h2_window_update"]
+
+    match = ((wu - expected).abs() <= tol).astype(float)
+    absent = (wu == 0).astype(float)
+
+    # Force 0 when the frame is absent, otherwise keep the match score
+    return pd.Series(
+        np.where(absent > 0, 0.0, match.values),
+        index=df.index,
+    )
+
+
+def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series:
+    """Dimension 3 — H2 pseudo-header order (weight 0.15).
+
+    With ``h2_pseudo_ord_raw`` available: exact match -> 1.0, non-empty but
+    different -> 0.0, empty -> neutral 0.5.
+
+    Without it, falls back on the atomic signals precomputed in SQL.
+    NOTE(review): the fallback scores 0.0 (not the neutral 0.5) when those
+    atomic columns are missing or zero — confirm this is intended.
+    """
+    sig = BROWSER_SIGNATURES[family]
+    expected = sig["pseudo_header_order"]
+
+    if "h2_pseudo_ord_raw" in df.columns:
+        h2_order = df["h2_pseudo_ord_raw"].fillna("").astype(str)
+        absent = h2_order == ""
+        match = (h2_order == expected).astype(float)
+        return pd.Series(
+            np.where(absent, 0.5, match.values),
+            index=df.index,
+        )
+
+    if family == "Firefox":
+        return _col(df, "h2_order_firefox")
+
+    if family in ("Chrome", "Safari"):
+        # Chrome and Safari share the m,a,s,p order, so the family's own
+        # WINDOW_UPDATE value (read from the signature table rather than
+        # duplicated here) is what tells them apart.
+        h2_order_cs = _col(df, "h2_order_chromesafari")
+        wu = _col(df, "h2_window_update_value")
+        wu_match = (
+            (wu - sig["h2_window_update"]).abs() <= sig["h2_window_update_tolerance"]
+        ).astype(float)
+        return (h2_order_cs * wu_match).clip(0, 1)
+
+    return pd.Series(0.5, index=df.index, dtype=float)
+
+
+def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series:
+    """Dimension 4 — presence of H2 PRIORITY frames (weight 0.10).
+
+    Score 1.0 when ``h2_priority_present`` equals the family's
+    ``h2_priority_frames_expected`` flag, 0.0 otherwise. Neutral 0.5 when the
+    column is missing, or when the row has neither known SETTINGS nor a
+    WINDOW_UPDATE value.
+    """
+    sig = BROWSER_SIGNATURES[family]
+    expected_present = sig["h2_priority_frames_expected"]
+
+    if "h2_priority_present" not in df.columns:
+        return pd.Series(0.5, index=df.index, dtype=float)
+
+    priority_present = _col(df, "h2_priority_present").astype(bool)
+    match = (priority_present == expected_present).astype(float)
+
+    # Neutral when the row carries no H2 data at all
+    h2_known = _col(df, "h2_settings_known")
+    wu = _col(df, "h2_window_update_value")
+    no_h2 = ((h2_known == 0) & (wu == 0)).astype(float)
+
+    return pd.Series(
+        np.where(no_h2 > 0, 0.5, match.values),
+        index=df.index,
+    )
+
+
+def _d5_http_headers(df: pd.DataFrame, family: str) -> pd.Series:
+    """Dimension 5 — HTTP header coherence (weight 0.15).
+
+    Sum of three components worth up to 0.25 each:
+      * Accept-Language present (when the family requires it);
+      * Sec-Fetch-*: presence rate when required, absence rate when
+        forbidden, flat 0.125 when neither applies;
+      * Sec-CH-UA: present when required, absent otherwise.
+
+    NOTE(review): the maximum reachable value is therefore 0.75, not 1.0 —
+    confirm the family thresholds were calibrated with that cap in mind.
+    """
+    sig = BROWSER_SIGNATURES[family]
+
+    score = pd.Series(0.0, index=df.index, dtype=float)
+
+    # Accept-Language
+    if sig["accept_language_required"]:
+        has_lang = _col(df, "has_accept_language")
+        score += has_lang.clip(0, 1) * 0.25
+
+    # Sec-Fetch-*
+    if sig["headers_sec_fetch_required"]:
+        sec_fetch_rate = 1.0 - _col(df, "sec_fetch_absence_rate", 0.5).clip(0, 1)
+        score += sec_fetch_rate * 0.25
+    elif sig.get("headers_sec_fetch_forbidden"):
+        # Family must not send Sec-Fetch: reward its absence
+        sec_fetch_absent = _col(df, "sec_fetch_absence_rate", 1.0).clip(0, 1)
+        score += sec_fetch_absent * 0.25
+    else:
+        score += 0.125  # neutral
+
+    # Sec-CH-UA
+    has_ch_ua = _col(df, "has_sec_ch_ua")
+    if sig["headers_ch_ua_required"]:
+        score += has_ch_ua.clip(0, 1) * 0.25
+    else:
+        # Penalise the header when the family is not expected to send it
+        absent_ch_ua = (1 - has_ch_ua.clip(0, 1))
+        score += absent_ch_ua * 0.25
+
+    return score.clip(0, 1)
+
+
+def _d6_tls_structure(df: pd.DataFrame, family: str) -> pd.Series:
+    """Dimension 6 — TLS structure (weight 0.10).
+
+    Score is ``0.7 * family_match + 0.3 * (1 - tls12_ratio)``, where
+    ``family_match`` tests ``browser_family`` against the signature's
+    ``ja4_families``.
+    """
+    sig = BROWSER_SIGNATURES[family]
+    expected_families = set(sig["tls"]["ja4_families"])
+
+    browser_family = _col(df, "browser_family", "").astype(str)
+    family_match = browser_family.isin(expected_families).astype(float)
+
+    # A high TLS 1.2 share lowers the score for every family
+    tls12_ratio = _col(df, "tls12_ratio", 0.0).clip(0, 1)
+    tls13_bonus = 1.0 - tls12_ratio
+
+    # No explicit GREASE signal is read here, so the formula is the same
+    # whether or not the signature expects GREASE (the former if/else had
+    # two identical branches).
+    return (family_match * 0.7 + tls13_bonus * 0.3).clip(0, 1)
+
+
+def _d7_ja4_dict(df: pd.DataFrame, family: str) -> pd.Series:
+    """Dimension 7 — ``browser_family`` is one of the family's JA4 families (weight 0.05)."""
+    sig = BROWSER_SIGNATURES[family]
+    expected_families = set(sig["tls"]["ja4_families"])
+
+    browser_family = _col(df, "browser_family", "").astype(str)
+    return browser_family.isin(expected_families).astype(float)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Composite score
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _compute_family_score(df: pd.DataFrame, family: str) -> pd.Series:
+    """Return the match score in [0.0, 1.0] of every row for one browser family.
+
+    The score is the sum of the seven dimensions weighted by
+    ``DIMENSION_WEIGHTS``.
+    """
+    w = DIMENSION_WEIGHTS
+    score = (
+        _d1_h2_settings(df, family) * w["h2_settings"]
+        + _d2_h2_window(df, family) * w["h2_window"]
+        + _d3_pseudo_order(df, family) * w["pseudo_order"]
+        + _d4_h2_priority(df, family) * w["h2_priority"]
+        + _d5_http_headers(df, family) * w["http_headers"]
+        + _d6_tls_structure(df, family) * w["tls_structure"]
+        + _d7_ja4_dict(df, family) * w["ja4_dict"]
+    )
+    return score.clip(0.0, 1.0)
+
+
+def _matches_non_browser(df: pd.DataFrame) -> pd.Series:
+    """Return a boolean mask of rows carrying a non-browser tool signature.
+
+    Two signals are checked on ``h2_window_update_value``: a value within
+    1000 of the go net/http signature, or a value of 0 on a row that
+    otherwise has H2 data (``h2_settings_known`` set).
+    """
+    wu = _col(df, "h2_window_update_value")
+
+    # go net/http uses a very distinctive WINDOW_UPDATE value
+    go_sig = (wu - NON_BROWSER_SIGNATURES["go_net_http"]["h2_window_update"]).abs() < 1000
+
+    # curl / httpx: WINDOW_UPDATE absent (= 0) although H2 data exists
+    h2_present = _col(df, "h2_settings_known").astype(bool) | (wu > 0)
+    curl_sig = (wu == 0) & h2_present
+
+    return (go_sig | curl_sig).astype(bool)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Main entry point
+# ─────────────────────────────────────────────────────────────────────────────
+
+def run_browser_matcher(df: pd.DataFrame) -> pd.DataFrame:
+    """Run the signature matcher over the whole DataFrame (vectorised).
+
+    Adds these columns to ``df`` in place and returns it:
+        bm_family      (str)   — identified family, '' if none
+        bm_score       (float) — adjusted best score, rounded to 4 decimals
+        bm_partial     (bool)  — score >= 0.45 but below the family threshold,
+                                 and no tool signature
+        bm_non_browser (bool)  — a tool signature was detected
+        bm_decision    (str)   — 'LEGITIMATE_BROWSER', 'PARTIAL',
+                                 'NON_BROWSER' or 'UNKNOWN'
+
+    When the matcher is disabled or ``df`` is empty, the columns are filled
+    with their neutral values ('', 0.0, False, False, 'UNKNOWN').
+    """
+    if not BROWSER_MATCHER_ENABLED or df.empty:
+        df["bm_family"] = ""
+        df["bm_score"] = 0.0
+        df["bm_partial"] = False
+        df["bm_non_browser"] = False
+        df["bm_decision"] = "UNKNOWN"
+        return df
+
+    score_df = pd.DataFrame(
+        {family: _compute_family_score(df, family) for family in BROWSER_SIGNATURES},
+        index=df.index,
+    )
+    best_family = score_df.idxmax(axis=1)
+    best_score = score_df.max(axis=1)
+
+    non_browser_mask = _matches_non_browser(df)
+
+    # Rows with a tool signature keep only 60 % of their score and lose
+    # their family
+    adjusted_score = pd.Series(
+        np.where(non_browser_mask, best_score.values * 0.6, best_score.values),
+        index=df.index,
+    )
+    adjusted_family = best_family.copy()
+    adjusted_family[non_browser_mask] = ""
+
+    # Per-family threshold; rows without a family get 1.0
+    thresholds = adjusted_family.map(lambda f: BROWSER_THRESHOLDS.get(f, 1.0))
+    above_threshold = adjusted_score >= thresholds
+    partial_mask = (
+        (~above_threshold)
+        & (adjusted_score >= _PARTIAL_SCORE_FLOOR)
+        & (~non_browser_mask)
+    )
+
+    # Final decision. np.select returns the first matching condition, which
+    # reproduces the priority NON_BROWSER > LEGITIMATE_BROWSER > PARTIAL of
+    # the former per-row Python loop without leaving vectorised code.
+    has_family = adjusted_family != ""
+    decision = pd.Series(
+        np.select(
+            [
+                non_browser_mask.to_numpy(),
+                (above_threshold & has_family).to_numpy(),
+                (partial_mask & has_family).to_numpy(),
+            ],
+            ["NON_BROWSER", "LEGITIMATE_BROWSER", "PARTIAL"],
+            default="UNKNOWN",
+        ),
+        index=df.index,
+    )
+
+    df["bm_family"] = adjusted_family.where(above_threshold | partial_mask, "")
+    df["bm_score"] = adjusted_score.round(4)
+    df["bm_partial"] = partial_mask
+    df["bm_non_browser"] = non_browser_mask
+    df["bm_decision"] = decision
+
+    return df
+
+
+def log_dual_mode_comparison(
+    df: pd.DataFrame, cycle_id: str, model_name: str
+) -> None:
+    """Log how browser_confidence (old) and browser_matcher (new) decisions compare.
+
+    Counts the rows each system considers legitimate (both, old only, new
+    only, neither) and logs up to three sample rows for each kind of
+    disagreement. Does nothing when ``browser_confidence`` or ``bm_decision``
+    is missing from ``df``.
+
+    Args:
+        df: Scored sessions.
+        cycle_id: Accepted for interface compatibility; not used here.
+        model_name: Prefix of the emitted log lines.
+    """
+    if "browser_confidence" not in df.columns or "bm_decision" not in df.columns:
+        return
+
+    from .config import BROWSER_CONFIDENCE_THRESHOLD
+
+    inferred_family = df.get("inferred_browser_family", pd.Series("", index=df.index))
+    old_legit = (
+        df["browser_confidence"] >= BROWSER_CONFIDENCE_THRESHOLD
+    ) & inferred_family.fillna("").astype(str).ne("")
+    new_legit = df["bm_decision"] == "LEGITIMATE_BROWSER"
+
+    only_old_mask = old_legit & ~new_legit
+    only_new_mask = ~old_legit & new_legit
+
+    only_old = int(only_old_mask.sum())
+    only_new = int(only_new_mask.sum())
+    both = int((old_legit & new_legit).sum())
+    neither = int((~old_legit & ~new_legit).sum())
+
+    log_info(
+        f"[{model_name}][DUAL_MODE] browser_confidence vs browser_matcher —"
+        f" concordance={both}, seul_ancien={only_old},"
+        f" seul_nouveau={only_new}, aucun={neither}"
+    )
+
+    # Only show columns that exist: a frame without src_ip / ja4 must not
+    # turn a diagnostic log into a KeyError.
+    sample_cols = [
+        c
+        for c in ("src_ip", "ja4", "browser_confidence", "bm_family", "bm_score")
+        if c in df.columns
+    ]
+    for label, mask, count in (
+        ("seul_ancien", only_old_mask, only_old),
+        ("seul_nouveau", only_new_mask, only_new),
+    ):
+        if count > 0:
+            sample = df.loc[mask, sample_cols].head(3)
+            log_info(f"[{model_name}][DUAL_MODE] Exemples {label}:\n{sample.to_string()}")
diff --git a/services/bot-detector/bot_detector/browser_signatures.py b/services/bot-detector/bot_detector/browser_signatures.py
new file mode 100644
index 0000000..3b998ee
--- /dev/null
+++ b/services/bot-detector/bot_detector/browser_signatures.py
@@ -0,0 +1,105 @@
+"""Signatures statiques de navigateurs pour la détection passive d'imposteurs.
+
+Module de données pures : aucun import, aucune logique.
+Les valeurs de fingerprint H2 correspondent aux captures réelles
+(format Akamai : SETTINGS|WINDOW_UPDATE|PRIORITY|PSEUDO_ORDER,
+ SETTINGS au format clé:valeur séparés par des virgules).
+"""
+
+# Signatures de navigateurs légitimes.
+# H2 SETTINGS keys:
+#   1 = HEADER_TABLE_SIZE        2 = ENABLE_PUSH
+#   3 = MAX_CONCURRENT_STREAMS   4 = INITIAL_WINDOW_SIZE
+#   5 = MAX_FRAME_SIZE           6 = MAX_HEADER_LIST_SIZE
+#   9 = UNKNOWN (Safari only) — NOTE(review): 0x9 is SETTINGS_NO_RFC7540_PRIORITIES in RFC 9218; no signature below lists key 9
+BROWSER_SIGNATURES: dict = {
+    "Chrome": {
+        # Chrome 119+ (Blink+BoringSSL) — stable since Chrome 106
+        "h2_settings_exact": {1: 65536, 2: 0, 4: 6291456, 6: 262144},
+        "h2_settings_forbidden_keys": [3, 5],
+        "h2_window_update": 15663105,
+        "h2_window_update_tolerance": 1000,
+        "h2_priority_frames_expected": False,
+        "pseudo_header_order": "m,a,s,p",
+        "tls": {
+            "ja4_families": ["Chromium", "Chrome", "Edge"],
+            "grease_expected": True,
+        },
+        "headers_required": ["Accept-Language"],
+        "headers_sec_fetch_required": True,   # Sec-Fetch-Site/Mode/Dest
+        "headers_ch_ua_required": True,       # Sec-CH-UA
+        "headers_sec_fetch_forbidden": False,
+        "accept_language_required": True,
+    },
+    "Firefox": {
+        # Firefox 90+ (Gecko+NSS)
+        "h2_settings_exact": {1: 65536, 4: 131072, 5: 16384},
+        "h2_settings_forbidden_keys": [2, 3, 6],
+        "h2_window_update": 12517377,
+        "h2_window_update_tolerance": 1000,
+        "h2_priority_frames_expected": False,  # per the reference CSV: PRIORITY = 0
+        "pseudo_header_order": "m,p,s,a",      # order actually captured (not m,p,a,s)
+        "tls": {
+            "ja4_families": ["Firefox"],
+            "grease_expected": False,
+        },
+        "headers_required": ["Accept-Language"],
+        "headers_sec_fetch_required": True,
+        "headers_ch_ua_required": False,       # Firefox does not support Client Hints
+        "headers_sec_fetch_forbidden": False,
+        "accept_language_required": True,
+    },
+    "Safari": {
+        # Safari 15+ (WebKit, iOS and macOS)
+        # Observed SETTINGS: 1:4096, 3:100, 4:65535
+        # Key 4 (INITIAL_WINDOW_SIZE=65535) sets Safari apart from Chrome/Firefox
+        "h2_settings_exact": {1: 4096, 3: 100, 4: 65535},
+        "h2_settings_forbidden_keys": [2, 5, 6],
+        "h2_window_update": 10485760,
+        "h2_window_update_tolerance": 1000,
+        "h2_priority_frames_expected": False,
+        "pseudo_header_order": "m,a,s,p",      # same as Chrome (WU=10485760 tells them apart)
+        "tls": {
+            "ja4_families": ["Safari"],
+            "grease_expected": False,
+        },
+        "headers_required": ["Accept-Language"],
+        "headers_sec_fetch_required": False,   # Safari assumed not to send Sec-Fetch-* — NOTE(review): confirm for Safari 16.4+
+        "headers_ch_ua_required": False,       # Safari does not support Client Hints
+        "headers_sec_fetch_forbidden": True,   # presence = inconsistency
+        "accept_language_required": True,
+    },
+}
+
+# Non-browser tool signatures (fast-reject signals).
+NON_BROWSER_SIGNATURES: dict = {
+    "curl": {
+        "h2_window_update": 0,               # no WINDOW_UPDATE frame
+        "pseudo_header_order": "m,p,s,a",
+    },
+    "python_httpx": {
+        "h2_window_update": 0,
+        "h2_settings_window": 65535,         # typical INITIAL_WINDOW_SIZE
+    },
+    "go_net_http": {
+        "h2_window_update": 1073676289,      # distinctive go net/http value
+    },
+}
+
+# Decision thresholds: minimum score required to grant a LEGITIMATE_BROWSER bypass.
+BROWSER_THRESHOLDS: dict = {
+    "Chrome": 0.72,
+    "Firefox": 0.68,
+    "Safari": 0.68,
+}
+
+# Scoring dimension weights (sum = 1.00).
+DIMENSION_WEIGHTS: dict = { + "h2_settings": 0.30, + "h2_window": 0.15, + "pseudo_order": 0.15, + "h2_priority": 0.10, + "http_headers": 0.15, + "tls_structure": 0.10, + "ja4_dict": 0.05, +} diff --git a/services/bot-detector/bot_detector/pipeline.py b/services/bot-detector/bot_detector/pipeline.py index 8c42ac7..8b33ede 100644 --- a/services/bot-detector/bot_detector/pipeline.py +++ b/services/bot-detector/bot_detector/pipeline.py @@ -22,6 +22,7 @@ from .scoring import ( compute_exiffi_importance, compute_ae_feature_errors, get_meta_learner, FINGERPRINT_COHERENCE_THRESHOLD, ) +from .browser_matcher import log_dual_mode_comparison, BROWSER_MATCHER_ENABLED, BROWSER_MATCHER_REPLACE # ═══════════════════════════════════════════════════════════════════════════════ @@ -273,6 +274,33 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): 'axis_means': ax_means, }) + # ── A9b — DUAL_MODE : journaliser les décisions browser_matcher vs browser_confidence ── + # Quand BROWSER_MATCHER_REPLACE=true, browser_matcher pilote le bypass à la place. 
+ if BROWSER_MATCHER_ENABLED and 'bm_decision' in unknown_traffic.columns: + log_dual_mode_comparison(unknown_traffic, cycle_id, name) + if BROWSER_MATCHER_REPLACE: + # Appliquer la décision du matcher (remplace le résultat du bloc A9 ci-dessus) + bm_legit = unknown_traffic['bm_decision'] == 'LEGITIMATE_BROWSER' + if bm_legit.any(): + unknown_traffic.loc[bm_legit, 'threat_level'] = 'LEGITIMATE_BROWSER' + unknown_traffic.loc[bm_legit, 'reason'] = ( + '[BrowserMatcher] ' + + unknown_traffic.loc[bm_legit, 'bm_family'].fillna('Unknown') + + ' (score=' + unknown_traffic.loc[bm_legit, 'bm_score'].round(2).astype(str) + ')' + ) + log_info( + f"[{name}][BrowserMatcher] {bm_legit.sum()} bypass(es) appliqué(s) " + f"(BROWSER_MATCHER_REPLACE=true)" + ) + # Atténuation par score partiel pour les zones grises + bm_partial = unknown_traffic['bm_decision'] == 'PARTIAL' + if bm_partial.any(): + partial_scores = unknown_traffic.loc[bm_partial, 'bm_score'].fillna(0.0) + unknown_traffic.loc[bm_partial, 'raw_anomaly_score'] = ( + unknown_traffic.loc[bm_partial, 'raw_anomaly_score'] + * (1 - 0.5 * partial_scores.values) + ) + # Capturer toutes les sessions scorées (avant filtrage par seuil) — pour ml_all_scores all_scored = unknown_traffic.copy() diff --git a/services/bot-detector/bot_detector/preprocessing.py b/services/bot-detector/bot_detector/preprocessing.py index 92c1b99..cb158e0 100644 --- a/services/bot-detector/bot_detector/preprocessing.py +++ b/services/bot-detector/bot_detector/preprocessing.py @@ -9,6 +9,7 @@ import numpy as np from .config import BROWSER_CONFIDENCE_THRESHOLD from .log import log_info from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family +from .browser_matcher import run_browser_matcher, log_dual_mode_comparison, BROWSER_MATCHER_ENABLED # ═══════════════════════════════════════════════════════════════════════════════ @@ -52,6 +53,10 @@ FEATURES = [ # §2 — Features HTTP/2 (fingerprint SETTINGS, cohérence H2↔JA4) 
'h2_settings_known', 'h2_pseudo_order_match', 'h2_ja4_coherence', 'h2_settings_rare', + # §4 — Signaux atomiques H2 pour le browser_matcher (Famille 4 : Cohérence cross-layer) + 'tls_h2_family_mismatch', + 'h2_window_chrome', 'h2_window_firefox', 'h2_window_safari', 'h2_window_absent', + 'h2_order_chromesafari', 'h2_order_firefox', # §3 — Score de cohérence de fingerprint cross-layer 'fingerprint_coherence_score', ] @@ -92,6 +97,13 @@ def preprocess_df(df: pd.DataFrame) -> pd.DataFrame: 'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence']: df[ax] = browser_axes[ax] + # ── A9b — Browser Signature Matcher (parallèle à browser_confidence) ───── + # En mode DUAL_MODE (BROWSER_MATCHER_REPLACE=false), les colonnes bm_* sont + # ajoutées pour journalisation uniquement — le bypass reste piloté par + # browser_confidence jusqu'à la validation complète. + if BROWSER_MATCHER_ENABLED: + df = run_browser_matcher(df) + # Rétro-compatibilité df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int) df['browser_consistency_score'] = ( @@ -117,6 +129,10 @@ def preprocess_df(df: pd.DataFrame) -> pd.DataFrame: 'is_fake_navigation', 'has_xff', 'sec_ch_mobile_mismatch', # §2 — Features HTTP/2 binaires 'h2_settings_known', 'h2_pseudo_order_match', 'h2_ja4_coherence', 'h2_settings_rare', + # §4 — Signaux atomiques H2 binaires + 'tls_h2_family_mismatch', + 'h2_window_chrome', 'h2_window_firefox', 'h2_window_safari', 'h2_window_absent', + 'h2_order_chromesafari', 'h2_order_firefox', 'h2_priority_present', } for col in df.columns: if col in binary_features: diff --git a/services/bot-detector/bot_detector/tests/test_browser_matcher.py b/services/bot-detector/bot_detector/tests/test_browser_matcher.py new file mode 100644 index 0000000..6b47557 --- /dev/null +++ b/services/bot-detector/bot_detector/tests/test_browser_matcher.py @@ -0,0 +1,272 @@ +"""Tests unitaires pour browser_matcher.py. + +Vérifie les 6 cas de base : + 1. 
Chrome full match → famille Chrome, score ≥ 0.72 → LEGITIMATE_BROWSER + 2. Firefox full match → famille Firefox, score ≥ 0.68 → LEGITIMATE_BROWSER + 3. Safari full match → famille Safari, score ≥ 0.68 → LEGITIMATE_BROWSER + 4. curl rejeté → NON_BROWSER (pas de bypass) + 5. httpcloak partial → score < seuil, PARTIAL + 6. TLS↔H2 mismatch → tls_h2_family_mismatch calculé correctement + +Les tests sont auto-contenus : ils construisent des DataFrames pandas +représentant des sessions fictives sans interagir avec ClickHouse. +""" +import pytest +import pandas as pd +import numpy as np + +# Les importations du module principal sont testées directement. +from bot_detector.browser_signatures import BROWSER_SIGNATURES, BROWSER_THRESHOLDS +from bot_detector.browser_matcher import ( + run_browser_matcher, + _compute_family_score, + _matches_non_browser, +) + + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +def _chrome_session(**overrides) -> pd.DataFrame: + """Session Chrome parfaite (fingerprint complet).""" + base = { + "src_ip": "1.2.3.4", + "ja4": "t13d1517h2_8daaf6152771_e5627efa2ab1", + "browser_family": "Chromium", + "h2_dict_family": "Chrome", + "h2_dict_confidence": 1.0, + "h2_window_update_value": 15663105, + "h2_order_chromesafari": 1, + "h2_order_firefox": 0, + "h2_window_chrome": 1, + "h2_window_firefox": 0, + "h2_window_safari": 0, + "h2_window_absent": 0, + "h2_priority_present": 0, + "h2_settings_known": 1, + "h2_ja4_coherence": 1, + "h2_settings_rare": 0, + "has_accept_language": 1, + "has_sec_ch_ua": 1, + "sec_fetch_absence_rate": 0.0, + "modern_browser_score": 1.0, + "has_xff": 0, + "tls12_ratio": 0.0, + } + base.update(overrides) + return pd.DataFrame([base]) + + +def _firefox_session(**overrides) -> pd.DataFrame: + """Session Firefox parfaite.""" + base = { + "src_ip": "2.3.4.5", + "ja4": "t13d1516h2_8daaf6152771_e5627efa2ab2", + 
"browser_family": "Firefox", + "h2_dict_family": "Firefox", + "h2_dict_confidence": 1.0, + "h2_window_update_value": 12517377, + "h2_order_chromesafari": 0, + "h2_order_firefox": 1, + "h2_window_chrome": 0, + "h2_window_firefox": 1, + "h2_window_safari": 0, + "h2_window_absent": 0, + "h2_priority_present": 0, + "h2_settings_known": 1, + "h2_ja4_coherence": 1, + "h2_settings_rare": 0, + "has_accept_language": 1, + "has_sec_ch_ua": 0, # Firefox n'envoie pas Sec-CH-UA + "sec_fetch_absence_rate": 0.0, + "modern_browser_score": 1.0, + "has_xff": 0, + "tls12_ratio": 0.0, + } + base.update(overrides) + return pd.DataFrame([base]) + + +def _safari_session(**overrides) -> pd.DataFrame: + """Session Safari parfaite.""" + base = { + "src_ip": "3.4.5.6", + "ja4": "t13d1516h2_8daaf6152771_e5627efa2ab3", + "browser_family": "Safari", + "h2_dict_family": "Safari", + "h2_dict_confidence": 1.0, + "h2_window_update_value": 10485760, + "h2_order_chromesafari": 1, # m,a,s,p partagé avec Chrome + "h2_order_firefox": 0, + "h2_window_chrome": 0, + "h2_window_firefox": 0, + "h2_window_safari": 1, + "h2_window_absent": 0, + "h2_priority_present": 0, + "h2_settings_known": 1, + "h2_ja4_coherence": 1, + "h2_settings_rare": 0, + "has_accept_language": 1, + "has_sec_ch_ua": 0, + "sec_fetch_absence_rate": 1.0, # Safari n'envoie pas Sec-Fetch-* + "modern_browser_score": 0.8, + "has_xff": 0, + "tls12_ratio": 0.0, + } + base.update(overrides) + return pd.DataFrame([base]) + + +def _curl_session(**overrides) -> pd.DataFrame: + """Session curl (WINDOW_UPDATE absent = 0, signature d'outil).""" + base = { + "src_ip": "4.5.6.7", + "ja4": "t13d0606h2_915bc7144b37_curl", + "browser_family": "", + "h2_dict_family": "", + "h2_dict_confidence": 0.0, + "h2_window_update_value": 0, + "h2_order_chromesafari": 0, + "h2_order_firefox": 0, + "h2_window_chrome": 0, + "h2_window_firefox": 0, + "h2_window_safari": 0, + "h2_window_absent": 1, + "h2_priority_present": 0, + "h2_settings_known": 1, # curl envoie des 
SETTINGS mais pas de WU + "h2_ja4_coherence": 0, + "h2_settings_rare": 1, + "has_accept_language": 0, + "has_sec_ch_ua": 0, + "sec_fetch_absence_rate": 1.0, + "modern_browser_score": 0.1, + "has_xff": 0, + "tls12_ratio": 0.1, + } + base.update(overrides) + return pd.DataFrame([base]) + + +# ───────────────────────────────────────────────────────────────────────────── +# Tests +# ───────────────────────────────────────────────────────────────────────────── + +def test_chrome_full_match(): + """Chrome avec fingerprint parfait : score ≥ 0.72 → LEGITIMATE_BROWSER.""" + df = _chrome_session() + result = run_browser_matcher(df) + assert result.loc[0, "bm_decision"] == "LEGITIMATE_BROWSER" + assert result.loc[0, "bm_family"] == "Chrome" + assert result.loc[0, "bm_score"] >= 0.72 + + +def test_firefox_full_match(): + """Firefox avec fingerprint parfait : score ≥ 0.68 → LEGITIMATE_BROWSER.""" + df = _firefox_session() + result = run_browser_matcher(df) + assert result.loc[0, "bm_decision"] == "LEGITIMATE_BROWSER" + assert result.loc[0, "bm_family"] == "Firefox" + assert result.loc[0, "bm_score"] >= 0.68 + + +def test_safari_full_match(): + """Safari : absence Sec-Fetch + WU=10485760 + dict→Safari → LEGITIMATE_BROWSER.""" + df = _safari_session() + result = run_browser_matcher(df) + assert result.loc[0, "bm_decision"] == "LEGITIMATE_BROWSER" + assert result.loc[0, "bm_family"] == "Safari" + assert result.loc[0, "bm_score"] >= 0.68 + + +def test_curl_rejected(): + """curl : WINDOW_UPDATE absent avec H2 présent → NON_BROWSER (pas de bypass).""" + df = _curl_session() + result = run_browser_matcher(df) + assert result.loc[0, "bm_decision"] == "NON_BROWSER" + assert result.loc[0, "bm_family"] == "" + assert result.loc[0, "bm_non_browser"] is True or result.loc[0, "bm_non_browser"] == True + + +def test_httpcloak_partial(): + """httpcloak : SETTINGS Chrome parfaits MAIS pseudo-header order ≠ Chrome → score partiel.""" + # H2 SETTINGS Chrome (dict renvoie 'Chrome') mais 
pseudo-order + # inconsistant (h2_order_chromesafari=0 = pas m,a,s,p) + df = _chrome_session( + h2_dict_family="Chrome", + h2_window_update_value=15663105, # Chrome WU ✓ + h2_order_chromesafari=0, # mauvais pseudo-header order + h2_order_firefox=1, # ressemble à Firefox ici + browser_family="", # JA4 inconnu + has_sec_ch_ua=0, # Sec-CH-UA absent (manque un required) + ) + result = run_browser_matcher(df) + # Doit être en dessous du seuil Chrome (0.72) + assert result.loc[0, "bm_score"] < BROWSER_THRESHOLDS["Chrome"] + # Avec des signaux partiels, doit être PARTIAL ou UNKNOWN (pas LEGITIMATE_BROWSER) + assert result.loc[0, "bm_decision"] in ("PARTIAL", "UNKNOWN", "NON_BROWSER") + + +def test_tls_h2_mismatch_detection(): + """JA4 identifie Chrome mais WINDOW_UPDATE = valeur Firefox → mismatch détectable.""" + # Simuler tls_h2_family_mismatch via les colonnes disponibles + # (la colonne SQL est calculée dans la vue, ici on vérifie la logique Python) + chrome_ja4_firefox_wu = { + "browser_family": "Chromium", # JA4 dit Chrome + "h2_window_update_value": 12517377, # WINDOW_UPDATE Firefox + "h2_window_chrome": 0, + "h2_window_firefox": 1, + "h2_window_safari": 0, + "h2_window_absent": 0, + "h2_order_chromesafari": 0, + "h2_order_firefox": 1, + "h2_dict_family": "Firefox", # dict H2 confirme Firefox + "h2_dict_confidence": 1.0, + "h2_settings_known": 1, + "h2_ja4_coherence": 0, # incohérence JA4 vs H2 + "has_accept_language": 1, + "has_sec_ch_ua": 1, + "sec_fetch_absence_rate": 0.0, + "has_xff": 0, + "tls12_ratio": 0.0, + } + df = pd.DataFrame([chrome_ja4_firefox_wu]) + result = run_browser_matcher(df) + + # Le score Chrome doit être bas (h2 ne matche pas Chrome) + chrome_score = _compute_family_score(df, "Chrome").iloc[0] + firefox_score = _compute_family_score(df, "Firefox").iloc[0] + + # Firefox score devrait être supérieur à Chrome (cohérence H2) + assert firefox_score > chrome_score, ( + f"Firefox score ({firefox_score:.3f}) devrait dépasser Chrome 
({chrome_score:.3f})" + ) + # La détection de mismatch : la session ne devrait pas être classée Chrome légitime + if result.loc[0, "bm_decision"] == "LEGITIMATE_BROWSER": + assert result.loc[0, "bm_family"] != "Chrome", ( + "Une session avec JA4=Chrome mais H2=Firefox ne devrait pas être classée Chrome légitime" + ) + + +def test_cdn_proxy_neutralizes_h2_settings(): + """Session derrière un CDN (has_xff=1) : dimension H2 SETTINGS neutralisée (0.5).""" + df = _chrome_session( + has_xff=1, # proxy CDN détecté + h2_dict_family="", # le proxy a terminé la connexion H2 — fingerprint perdu + h2_settings_known=0, + ) + result = run_browser_matcher(df) + # Le score peut être inférieur mais la session ne devrait pas être NON_BROWSER + # (le proxy ne rend pas le client malveillant) + assert result.loc[0, "bm_non_browser"] is False or result.loc[0, "bm_non_browser"] == False + + +def test_non_browser_go_net_http(): + """go net/http : WINDOW_UPDATE=1073676289 → NON_BROWSER.""" + df = _curl_session( + h2_window_update_value=1073676289, + h2_window_absent=0, + h2_settings_known=1, + ) + result = run_browser_matcher(df) + assert result.loc[0, "bm_non_browser"] is True or result.loc[0, "bm_non_browser"] == True diff --git a/services/correlator/sql/migrations/06_browser_signature_detection.sql b/services/correlator/sql/migrations/06_browser_signature_detection.sql new file mode 100644 index 0000000..6493d03 --- /dev/null +++ b/services/correlator/sql/migrations/06_browser_signature_detection.sql @@ -0,0 +1,72 @@ +-- === 06_browser_signature_detection.sql === +-- Migration : Browser Signature Detection +-- +-- 1. Création de browser_h2_signatures (table source structurée) +-- 2. Peuplement des fingerprints Akamai de référence +-- 3. 
Re-create dict_browser_h2 with the confidence field
+--    (previously loaded from CSV — replaced by the ClickHouse table)
+--
+-- Apply with:
+--   clickhouse-client --multiquery < 06_browser_signature_detection.sql
+
+-- --- Source table for the H2 signatures ---
+CREATE TABLE IF NOT EXISTS ja4_processing.browser_h2_signatures
+(
+    h2_fingerprint String,
+    browser_family LowCardinality(String),
+    confidence Float32,
+    notes String
+)
+ENGINE = MergeTree()
+ORDER BY h2_fingerprint
+SETTINGS index_granularity = 128;
+
+-- Initial load from the historical data (exact match with the former CSV).
+-- Akamai format: SETTINGS|WINDOW_UPDATE|PRIORITY|PSEUDO_HEADER_ORDER; SETTINGS is a comma-separated list of key:value pairs.
+-- NOTE(review): comments sit between the VALUES tuples below — confirm the target ClickHouse version accepts them; re-running this INSERT presumably appends duplicate rows (plain MergeTree), which the dictionary query's GROUP BY collapses.
+INSERT INTO ja4_processing.browser_h2_signatures VALUES
+-- Chrome (Blink + BoringSSL, INITIAL_WINDOW_SIZE=6291456)
+('1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p',
+ 'Chrome', 1.0, 'Chrome 106–142 stable'),
+('1:65536,3:1000,4:6291456,6:262144|15663105|0|m,a,s,p',
+ 'Chrome', 0.95, 'Chrome avec MAX_CONCURRENT_STREAMS=1000 (certains proxies)'),
+('1:65536,2:0,3:100,4:6291456,6:262144|15663105|0|m,a,s,p',
+ 'Chrome', 0.90, 'Chrome avec MAX_CONCURRENT_STREAMS=100'),
+-- Firefox (Gecko + NSS, INITIAL_WINDOW_SIZE=131072, MAX_FRAME_SIZE=16384)
+('1:65536,4:131072,5:16384|12517377|0|m,p,s,a',
+ 'Firefox', 1.0, 'Firefox 90–127 stable'),
+('1:65536,4:131072|12517377|0|m,p,s,a',
+ 'Firefox', 0.95, 'Firefox sans MAX_FRAME_SIZE explicite'),
+('1:65536,3:100,4:131072,5:16384|12517377|0|m,p,s,a',
+ 'Firefox', 0.90, 'Firefox avec MAX_CONCURRENT_STREAMS=100'),
+-- Safari (WebKit, INITIAL_WINDOW_SIZE=65535)
+('1:4096,3:100,4:65535|10485760|0|m,a,s,p',
+ 'Safari', 1.0, 'Safari 15+ macOS et iOS'),
+('1:4096,3:100,4:65535,5:16384|10485760|0|m,a,s,p',
+ 'Safari', 0.95, 'Safari avec MAX_FRAME_SIZE=16384'),
+('1:4096,3:100,4:65535,6:16384|10485760|0|m,a,s,p',
+ 'Safari', 0.95, 'Safari avec MAX_HEADER_LIST_SIZE=16384'),
+-- Edge Chromium (H2 fingerprint identical to the first Chrome row; the dictionary's argMax(browser_family, confidence) resolves this key to the 1.0 'Chrome' entry)
+('1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p',
+ 'Edge', 0.9, 'Edge Chromium — fingerprint H2 identique à Chrome (dédoublonné)');
+
+-- Re-create the dictionary with the confidence field
+-- (replaces the CSV-backed version, which had no confidence attribute)
+DROP DICTIONARY IF EXISTS ja4_processing.dict_browser_h2;
+
+CREATE DICTIONARY ja4_processing.dict_browser_h2
+(
+    h2_fingerprint String,
+    browser_family String,
+    confidence Float32
+)
+PRIMARY KEY h2_fingerprint
+SOURCE(CLICKHOUSE(query '
+    SELECT h2_fingerprint,
+           argMax(browser_family, confidence) AS browser_family,
+           max(confidence) AS confidence
+    FROM ja4_processing.browser_h2_signatures
+    GROUP BY h2_fingerprint
+'))
+LAYOUT(COMPLEX_KEY_HASHED())
+LIFETIME(MIN 300 MAX 600);
diff --git a/shared/clickhouse/07_ai_features_view.sql b/shared/clickhouse/07_ai_features_view.sql index 6e45864..dc729b2 100644 --- a/shared/clickhouse/07_ai_features_view.sql +++ b/shared/clickhouse/07_ai_features_view.sql @@ -122,7 +122,30 @@ WITH base_data AS ( h2_fp != '' AND dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') = '', 1, 0 - ) AS h2_settings_rare + ) AS h2_settings_rare, + -- §4 — Famille identifiée par le dictionnaire H2 (browser_matcher) + dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', + tuple(h2_fp), '') AS h2_dict_family, + dictGetOrDefault('ja4_processing.dict_browser_h2', 'confidence', + tuple(h2_fp), toFloat32(0.0)) AS h2_dict_confidence, + -- §4 — Valeur brute WINDOW_UPDATE H2 (signal de famille le plus fiable) + h2_wu_val AS h2_window_update_value, + -- §4 — Signaux atomiques H2 pour le browser_matcher et le vecteur ML + toUInt8(h2_wu_val BETWEEN 15663000 AND 15664000) AS h2_window_chrome, + toUInt8(h2_wu_val BETWEEN 12517000 AND 12518000) AS h2_window_firefox, + toUInt8(h2_wu_val BETWEEN 10485700 AND 10485820) AS h2_window_safari, + toUInt8(h2_wu_val = 0 AND h2_fp != '') AS h2_window_absent, + -- Chrome et Safari partagent l'ordre 
m,a,s,p — utiliser WU pour distinguer + toUInt8(h2_pseudo_ord = 'm,a,s,p') AS h2_order_chromesafari, + toUInt8(h2_pseudo_ord = 'm,p,s,a') AS h2_order_firefox, + -- Présence de PRIORITY frames (3e champ de h2_fp, != '0' → Firefox ancien) + toUInt8( + h2_fp != '' + AND length(splitByChar('|', h2_fp)) >= 3 + AND arrayElement(splitByChar('|', h2_fp), 3) NOT IN ('', '0') + ) AS h2_priority_present, + -- Valeur brute du pseudo-header order (pour le matcher Python) + h2_pseudo_ord AS h2_pseudo_ord_raw FROM ( -- Jointure unique avec aliases explicites (contournement bug scope ClickHouse 24.8 -- où PARTITION BY src_ip échoue quand plusieurs sources de JOIN exposent src_ip) @@ -193,9 +216,10 @@ WITH base_data AS ( h.sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch, h.sec_fetch_mode AS sec_fetch_mode, h.sec_fetch_dest AS sec_fetch_dest, - -- colonnes HTTP/2 (defaut vide si pas de trafic H2) + -- colonnes HTTP/2 (defaut vide/0 si pas de trafic H2) COALESCE(h2.h2_fp, '') AS h2_fp, - COALESCE(h2.h2_pseudo_ord, '') AS h2_pseudo_ord + COALESCE(h2.h2_pseudo_ord, '') AS h2_pseudo_ord, + COALESCE(h2.h2_wu_val, 0) AS h2_wu_val FROM ( SELECT window_start, src_ip, ja4, host, src_asn, @@ -258,8 +282,9 @@ WITH base_data AS ( SELECT toStartOfHour(time) AS h2_window, toIPv6(src_ip) AS h2_ip, - anyIf(h2_fingerprint, h2_fingerprint != '') AS h2_fp, - anyIf(h2_pseudo_order, h2_pseudo_order != '') AS h2_pseudo_ord + anyIf(h2_fingerprint, h2_fingerprint != '') AS h2_fp, + anyIf(h2_pseudo_order, h2_pseudo_order != '') AS h2_pseudo_ord, + anyIf(h2_window_update, h2_window_update > 0) AS h2_wu_val FROM ja4_logs.http_logs WHERE time >= now() - INTERVAL 24 HOUR AND (h2_fingerprint != '' OR h2_pseudo_order != '') @@ -271,6 +296,18 @@ SELECT *, -(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)) OVER (PARTITION BY src_ip)) AS temporal_entropy, sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio, + -- §4 — Incohérence 
TLS↔H2 : JA4 identifie une famille mais H2 WINDOW_UPDATE en contredit une autre + toUInt8(CASE + WHEN browser_family IN ('Chromium', 'Chrome', 'Edge') + AND h2_window_update_value BETWEEN 12517000 AND 12518000 THEN 1 -- Chrome JA4 / Firefox H2 + WHEN browser_family IN ('Chromium', 'Chrome', 'Edge') + AND h2_window_update_value BETWEEN 10485700 AND 10485820 THEN 1 -- Chrome JA4 / Safari H2 + WHEN browser_family = 'Firefox' + AND h2_window_update_value BETWEEN 15663000 AND 15664000 THEN 1 -- Firefox JA4 / Chrome H2 + WHEN browser_family != '' AND h2_window_update_value = 0 + AND h2_settings_known > 0 THEN 1 -- Navigateur JA4 / pas de WU (outil) + ELSE 0 + END) AS tls_h2_family_mismatch, -- §3 — Score de cohérence de fingerprint cross-layer [0.0, 1.0] toFloat32( CASE WHEN browser_family != '' THEN 0.25 ELSE 0.0 END